]> jfr.im git - yt-dlp.git/blob - youtube-dl
Release 2012.01.08b
[yt-dlp.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# People who have contributed code to this program, in no particular order.
__authors__ = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    'Kevin Ngo',
    'Ori Avtalion',
    'shizeeg',
    )

__license__ = 'Public Domain'
# Release identifier of this script (matches the release date).
__version__ = '2012.01.08b'

# Location of the current version of this script on the project repository;
# presumably fetched by a self-update routine (not visible in this chunk).
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48 import ctypes
49
50 try:
51 import email.utils
52 except ImportError: # Python 2.4
53 import email.Utils
54 try:
55 import cStringIO as StringIO
56 except ImportError:
57 import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61 from urlparse import parse_qs
62 except ImportError:
63 from cgi import parse_qs
64
65 try:
66 import lxml.etree
67 except ImportError:
68 pass # Handled below
69
70 try:
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Default HTTP headers forced onto every outgoing request by
# YoutubeDLHandler.http_request().  The User-Agent mimics a desktop
# Firefox browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
82
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    # Minimal stand-in for the stdlib json module: a pure-Python
    # recursive-descent JSON parser exposing only json.loads().
    class json(object):
        @staticmethod
        def loads(s):
            """Parse a UTF-8 encoded JSON byte string and return the value."""
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                # i is the index in s where parsing failed.
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past whitespace; with expectMore, fail on premature end.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Decode one backslash escape sequence inside a JSON string.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        # Plain \uXXXX escape.
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        # UTF-16 surrogate pair: \uD8xx followed by \uDCxx.
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                # i points at the opening quote; returns (next_index, value).
                i += 1
                e = i
                while True:
                    e = s.index('"', e)
                    # Count the backslashes immediately before the quote:
                    # an odd number means the quote itself is escaped.
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                # i points at '{'; returns (next_index, dict).
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                # i points at '['; returns (next_index, list).
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # Parse the literals true / false / null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                # Integers become int, anything with . / e / E becomes float.
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first significant character; numbers are the default.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
195
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The original routed this through a single-shot generator and
    # .next(), which added nothing; a plain return is equivalent.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works; broken locales
        # can report a codec Python does not know.
        u'TEST'.encode(pref)
    except Exception:
        # Narrowed from a bare except: (which also swallowed
        # KeyboardInterrupt/SystemExit); fall back to UTF-8.
        pref = 'UTF-8'
    return pref
211
212
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference: decimal (&#160;) or hex (&#xA0;).
    # The old pattern used x?\d+, so hex references containing the
    # digits a-f never matched and fell through as literals.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241 """Sanitizes a video title so it could be used as part of a filename."""
242 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243 return utitle.replace(unicode(os.sep), u'%')
244
245
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means standard output.  On Windows, stdout must be put
            # into binary mode so the video data is not mangled.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
271
272
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # parsedate_tz yields None for unparsable input, in which case None
    # is propagated to the caller.
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
280
281 def _simplify_title(title):
282 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283 return expr.sub(u'_', title).strip(u'_')
284
285 def _orderedSet(iterable):
286 """ Remove all duplicates from the input iterable """
287 res = []
288 for el in iterable:
289 if el not in res:
290 res.append(el)
291 return res
292
def _unescapeHTML(s):
    """
    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')
    # Delegate entity decoding to the stdlib HTML parser.
    return HTMLParser.HTMLParser().unescape(s)
301
302 def _encodeFilename(s):
303 """
304 @param s The name of the file (of type unicode)
305 """
306
307 assert type(s) == type(u'')
308 return s.encode(sys.getfilesystemencoding(), 'ignore')
309
class DownloadError(Exception):
    """Raised when a download fails and errors are not being ignored.

    FileDownloader objects throw this exception when they are not
    configured to continue on errors; it carries the appropriate error
    message.
    """
    pass
318
319
class SameFileError(Exception):
    """Raised when several downloads would collide on one output file.

    FileDownloader objects throw this when they detect that multiple
    files would have to be written to the same path on disk.
    """
    pass
327
328
class PostProcessingError(Exception):
    """Raised by a PostProcessor's .run() method.

    Indicates that an error occurred during the postprocessing task.
    """
    pass
336
class MaxDownloadsReached(Exception):
    """Raised once the --max-downloads limit has been reached."""
    pass
340
341
class UnavailableVideoError(Exception):
    """Raised when a video is requested in an unavailable format.

    Thrown when the requested format does not exist for the video.
    """
    pass
349
350
class ContentTooShortError(Exception):
    """Raised when a download delivers fewer bytes than announced.

    FileDownloader objects may raise this when the file they downloaded
    is smaller than the size the server announced, which indicates the
    connection was probably interrupted.
    """
    # Byte counts: what was actually received vs. what the server promised.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
365
366
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

      http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Decompress "deflate" content: first assume a raw stream (no
        # zlib header); if that fails, retry with the standard wrapper.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Build an addinfourl carrying the HTTP status code even on
        # Python versions whose addinfourl has no code argument.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the std_headers values, replacing any headers of the
        # same name the caller may have set.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # The marker header is internal only: strip it (and the
        # Accept-encoding it disables) before the request goes out.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip: wrap the raw body in a GzipFile so callers read plain data.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate: decompress eagerly and serve from a StringIO.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
424
425
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    matchtitle:       Download only matching titles.
    rejecttitle:      Reject downloads for matching titles.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    writeinfojson:    Write the video description to a .info.json file
    """

    # Class-level placeholders; __init__ replaces them with per-instance
    # values.
    params = None               # Dictionary of options (see docstring above)
    _ies = []                   # Registered InfoExtractors, in order
    _pps = []                   # Registered PostProcessors, in order
    _download_retcode = None    # Return code (0 = OK, 1 = an error occurred)
    _num_downloads = None       # Ordinal of the current download
    _screen_file = None         # Stream used by to_screen (stdout or stderr)
490
491 def __init__(self, params):
492 """Create a FileDownloader object with the given options."""
493 self._ies = []
494 self._pps = []
495 self._download_retcode = 0
496 self._num_downloads = 0
497 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
498 self.params = params
499
500 @staticmethod
501 def format_bytes(bytes):
502 if bytes is None:
503 return 'N/A'
504 if type(bytes) is str:
505 bytes = float(bytes)
506 if bytes == 0.0:
507 exponent = 0
508 else:
509 exponent = long(math.log(bytes, 1024.0))
510 suffix = 'bkMGTPEZY'[exponent]
511 converted = float(bytes) / float(1024 ** exponent)
512 return '%.2f%s' % (converted, suffix)
513
514 @staticmethod
515 def calc_percent(byte_counter, data_len):
516 if data_len is None:
517 return '---.-%'
518 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
519
520 @staticmethod
521 def calc_eta(start, now, total, current):
522 if total is None:
523 return '--:--'
524 dif = now - start
525 if current == 0 or dif < 0.001: # One millisecond
526 return '--:--'
527 rate = float(current) / dif
528 eta = long((float(total) - float(current)) / rate)
529 (eta_mins, eta_secs) = divmod(eta, 60)
530 if eta_mins > 99:
531 return '--:--'
532 return '%02d:%02d' % (eta_mins, eta_secs)
533
534 @staticmethod
535 def calc_speed(start, now, bytes):
536 dif = now - start
537 if bytes == 0 or dif < 0.001: # One millisecond
538 return '%10s' % '---b/s'
539 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
540
541 @staticmethod
542 def best_block_size(elapsed_time, bytes):
543 new_min = max(bytes / 2.0, 1.0)
544 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
545 if elapsed_time < 0.001:
546 return long(new_max)
547 rate = bytes / elapsed_time
548 if rate > new_max:
549 return long(new_max)
550 if rate < new_min:
551 return long(new_min)
552 return long(rate)
553
554 @staticmethod
555 def parse_bytes(bytestr):
556 """Parse a string indicating a byte quantity into a long integer."""
557 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
558 if matchobj is None:
559 return None
560 number = float(matchobj.group(1))
561 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
562 return long(round(number * multiplier))
563
564 def add_info_extractor(self, ie):
565 """Add an InfoExtractor object to the end of the list."""
566 self._ies.append(ie)
567 ie.set_downloader(self)
568
569 def add_post_processor(self, pp):
570 """Add a PostProcessor object to the end of the chain."""
571 self._pps.append(pp)
572 pp.set_downloader(self)
573
574 def to_screen(self, message, skip_eol=False):
575 """Print message to stdout if not in quiet mode."""
576 assert type(message) == type(u'')
577 if not self.params.get('quiet', False):
578 terminator = [u'\n', u''][skip_eol]
579 output = message + terminator
580
581 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
582 output = output.encode(preferredencoding(), 'ignore')
583 self._screen_file.write(output)
584 self._screen_file.flush()
585
586 def to_stderr(self, message):
587 """Print message to stderr."""
588 print >>sys.stderr, message.encode(preferredencoding())
589
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # Xterm-compatible terminals: OSC 0 escape sequence sets the title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
600
601 def fixed_template(self):
602 """Checks if the output template is fixed."""
603 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
604
605 def trouble(self, message=None):
606 """Determine action to take when a download problem appears.
607
608 Depending on if the downloader has been configured to ignore
609 download errors or not, this method may throw an exception or
610 not when errors are found, after printing the message.
611 """
612 if message is not None:
613 self.to_stderr(message)
614 if not self.params.get('ignoreerrors', False):
615 raise DownloadError(message)
616 self._download_retcode = 1
617
618 def slow_down(self, start_time, byte_counter):
619 """Sleep if the download speed is over the rate limit."""
620 rate_limit = self.params.get('ratelimit', None)
621 if rate_limit is None or byte_counter == 0:
622 return
623 now = time.time()
624 elapsed = now - start_time
625 if elapsed <= 0.0:
626 return
627 speed = float(byte_counter) / elapsed
628 if speed > rate_limit:
629 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
630
631 def temp_name(self, filename):
632 """Returns a temporary filename for the given filename."""
633 if self.params.get('nopart', False) or filename == u'-' or \
634 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
635 return filename
636 return filename + u'.part'
637
638 def undo_temp_name(self, filename):
639 if filename.endswith(u'.part'):
640 return filename[:-len(u'.part')]
641 return filename
642
643 def try_rename(self, old_filename, new_filename):
644 try:
645 if old_filename == new_filename:
646 return
647 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
648 except (IOError, OSError), err:
649 self.trouble(u'ERROR: unable to rename file')
650
651 def try_utime(self, filename, last_modified_hdr):
652 """Try to set the last-modified time of the given file."""
653 if last_modified_hdr is None:
654 return
655 if not os.path.isfile(_encodeFilename(filename)):
656 return
657 timestr = last_modified_hdr
658 if timestr is None:
659 return
660 filetime = timeconvert(timestr)
661 if filetime is None:
662 return filetime
663 try:
664 os.utime(filename, (time.time(), filetime))
665 except:
666 pass
667 return filetime
668
    def report_writedescription(self, descfn):
        """ Report that the description file is being written """
        self.to_screen(u'[info] Writing video description to: ' + descfn)

    def report_writeinfojson(self, infofn):
        """ Report that the metadata file has been written """
        self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: ' + filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        # The leading '\r' rewrites the current terminal line in place,
        # and skip_eol keeps the cursor on it for the next update.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # The filename may not be representable in the console
            # encoding; fall back to a message without it.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # The progress line is still on screen; just terminate it.
            self.to_screen(u'')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1
719
720 def prepare_filename(self, info_dict):
721 """Generate the output filename."""
722 try:
723 template_dict = dict(info_dict)
724 template_dict['epoch'] = unicode(long(time.time()))
725 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
726 filename = self.params['outtmpl'] % template_dict
727 return filename
728 except (ValueError, KeyError), err:
729 self.trouble(u'ERROR: invalid system charset or erroneous output template')
730 return None
731
732 def _match_entry(self, info_dict):
733 """ Returns None iff the file should be downloaded """
734
735 title = info_dict['title']
736 matchtitle = self.params.get('matchtitle', False)
737 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
738 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
739 rejecttitle = self.params.get('rejecttitle', False)
740 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
741 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
742 return None
743
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""

        # Title match/reject filters may veto the download entirely.
        reason = self._match_entry(info_dict)
        if reason is not None:
            self.to_screen(u'[download] ' + reason)
            return

        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads > int(max_downloads):
                raise MaxDownloadsReached()

        filename = self.prepare_filename(info_dict)

        # Forced printings
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
            print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcedescription', False) and 'description' in info_dict:
            print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcefilename', False) and filename is not None:
            print filename.encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceformat', False):
            print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            return

        # prepare_filename() returns None on template errors.
        if filename is None:
            return

        try:
            dn = os.path.dirname(_encodeFilename(filename))
            if dn != '' and not os.path.exists(dn): # dn is already encoded
                os.makedirs(dn)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directory ' + unicode(err))
            return

        if self.params.get('writedescription', False):
            try:
                descfn = filename + u'.description'
                self.report_writedescription(descfn)
                descfile = open(_encodeFilename(descfn), 'wb')
                try:
                    descfile.write(info_dict['description'].encode('utf-8'))
                finally:
                    descfile.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                return

        if self.params.get('writeinfojson', False):
            infofn = filename + u'.info.json'
            self.report_writeinfojson(infofn)
            try:
                # Probe for a usable JSON encoder; the trivialjson
                # fallback above only provides loads(), not dump().
                json.dump
            except (NameError,AttributeError):
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                return
            try:
                infof = open(_encodeFilename(infofn), 'wb')
                try:
                    # 'urlhandle' is a live urllib2 response object and
                    # cannot be serialized, so it is filtered out.
                    json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
                    json.dump(json_info_dict, infof)
                finally:
                    infof.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                return

        if not self.params.get('skip_download', False):
            if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
                # The file already exists and must not be overwritten;
                # treat it as a successful download.
                success = True
            else:
                try:
                    success = self._do_download(filename, info_dict)
                except (OSError, IOError), err:
                    raise UnavailableVideoError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                    return
                except (ContentTooShortError, ), err:
                    self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                    return

            if success:
                try:
                    self.post_process(filename, info_dict)
                except (PostProcessingError), err:
                    self.trouble(u'ERROR: postprocessing: %s' % str(err))
                    return
841
842 def download(self, url_list):
843 """Download a given list of URLs."""
844 if len(url_list) > 1 and self.fixed_template():
845 raise SameFileError(self.params['outtmpl'])
846
847 for url in url_list:
848 suitable_found = False
849 for ie in self._ies:
850 # Go to next InfoExtractor if not suitable
851 if not ie.suitable(url):
852 continue
853
854 # Suitable InfoExtractor found
855 suitable_found = True
856
857 # Extract information from URL and process it
858 ie.extract(url)
859
860 # Suitable InfoExtractor had been found; go to next URL
861 break
862
863 if not suitable_found:
864 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
865
866 return self._download_retcode
867
868 def post_process(self, filename, ie_info):
869 """Run the postprocessing chain on the given file."""
870 info = dict(ie_info)
871 info['filepath'] = filename
872 for pp in self._pps:
873 info = pp.run(info)
874 if info is None:
875 break
876
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// stream by driving the external rtmpdump tool."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(_encodeFilename(tmpfilename))
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Retry with resume (-e); after exit code 1, also pass -k 1.
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(_encodeFilename(tmpfilename))
            if prevsize == cursize and retval == 1:
                # No progress was made; stop retrying.
                break
            # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
913
914 def _do_download(self, filename, info_dict):
915 url = info_dict['url']
916 player_url = info_dict.get('player_url', None)
917
918 # Check file already present
919 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
920 self.report_file_already_downloaded(filename)
921 return True
922
923 # Attempt to download using rtmpdump
924 if url.startswith('rtmp'):
925 return self._download_with_rtmpdump(filename, url, player_url)
926
927 tmpfilename = self.temp_name(filename)
928 stream = None
929
930 # Do not include the Accept-Encoding header
931 headers = {'Youtubedl-no-compression': 'True'}
932 basic_request = urllib2.Request(url, None, headers)
933 request = urllib2.Request(url, None, headers)
934
935 # Establish possible resume length
936 if os.path.isfile(_encodeFilename(tmpfilename)):
937 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
938 else:
939 resume_len = 0
940
941 open_mode = 'wb'
942 if resume_len != 0:
943 if self.params.get('continuedl', False):
944 self.report_resuming_byte(resume_len)
945 request.add_header('Range','bytes=%d-' % resume_len)
946 open_mode = 'ab'
947 else:
948 resume_len = 0
949
950 count = 0
951 retries = self.params.get('retries', 0)
952 while count <= retries:
953 # Establish connection
954 try:
955 if count == 0 and 'urlhandle' in info_dict:
956 data = info_dict['urlhandle']
957 data = urllib2.urlopen(request)
958 break
959 except (urllib2.HTTPError, ), err:
960 if (err.code < 500 or err.code >= 600) and err.code != 416:
961 # Unexpected HTTP error
962 raise
963 elif err.code == 416:
964 # Unable to resume (requested range not satisfiable)
965 try:
966 # Open the connection again without the range header
967 data = urllib2.urlopen(basic_request)
968 content_length = data.info()['Content-Length']
969 except (urllib2.HTTPError, ), err:
970 if err.code < 500 or err.code >= 600:
971 raise
972 else:
973 # Examine the reported length
974 if (content_length is not None and
975 (resume_len - 100 < long(content_length) < resume_len + 100)):
976 # The file had already been fully downloaded.
977 # Explanation to the above condition: in issue #175 it was revealed that
978 # YouTube sometimes adds or removes a few bytes from the end of the file,
979 # changing the file size slightly and causing problems for some users. So
980 # I decided to implement a suggested change and consider the file
981 # completely downloaded if the file size differs less than 100 bytes from
982 # the one in the hard drive.
983 self.report_file_already_downloaded(filename)
984 self.try_rename(tmpfilename, filename)
985 return True
986 else:
987 # The length does not match, we start the download over
988 self.report_unable_to_resume()
989 open_mode = 'wb'
990 break
991 # Retry
992 count += 1
993 if count <= retries:
994 self.report_retry(count, retries)
995
996 if count > retries:
997 self.trouble(u'ERROR: giving up after %s retries' % retries)
998 return False
999
1000 data_len = data.info().get('Content-length', None)
1001 if data_len is not None:
1002 data_len = long(data_len) + resume_len
1003 data_len_str = self.format_bytes(data_len)
1004 byte_counter = 0 + resume_len
1005 block_size = 1024
1006 start = time.time()
1007 while True:
1008 # Download and write
1009 before = time.time()
1010 data_block = data.read(block_size)
1011 after = time.time()
1012 if len(data_block) == 0:
1013 break
1014 byte_counter += len(data_block)
1015
1016 # Open file just in time
1017 if stream is None:
1018 try:
1019 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1020 assert stream is not None
1021 filename = self.undo_temp_name(tmpfilename)
1022 self.report_destination(filename)
1023 except (OSError, IOError), err:
1024 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1025 return False
1026 try:
1027 stream.write(data_block)
1028 except (IOError, OSError), err:
1029 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1030 return False
1031 block_size = self.best_block_size(after - before, len(data_block))
1032
1033 # Progress message
1034 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1035 if data_len is None:
1036 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1037 else:
1038 percent_str = self.calc_percent(byte_counter, data_len)
1039 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1040 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1041
1042 # Apply rate limit
1043 self.slow_down(start, byte_counter - resume_len)
1044
1045 if stream is None:
1046 self.trouble(u'\nERROR: Did not get any data blocks')
1047 return False
1048 stream.close()
1049 self.report_finish()
1050 if data_len is not None and byte_counter != data_len:
1051 raise ContentTooShortError(byte_counter, long(data_len))
1052 self.try_rename(tmpfilename, filename)
1053
1054 # Update file modification time
1055 if self.params.get('updatetime', True):
1056 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1057
1058 return True
1059
1060
class InfoExtractor(object):
	"""Base class for all information extractors.

	Given a URL, an information extractor pulls out the metadata the
	FileDownloader needs: the real video URL, the literal and simplified
	titles, the uploader, and so on. The metadata travels as a dictionary
	which the FileDownloader then processes (possibly downloading the
	video, among other outcomes). Every dictionary must carry these keys:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following keys are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses should override _real_initialize() and _real_extract(),
	define a _VALID_URL regexp, and usually be registered in the list
	of extractors.
	"""

	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Remember the downloader and start out uninitialized."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True when this extractor can handle the given URL."""
		match = re.match(self._VALID_URL, url)
		return match is not None

	def initialize(self):
		"""Run one-time setup (authentication, etc) exactly once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if necessary, then perform the real extraction."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Actual initialization; subclasses override this."""
		pass

	def _real_extract(self, url):
		"""Actual extraction; subclasses override this."""
		pass
1129
1130
1131 class YoutubeIE(InfoExtractor):
1132 """Information extractor for youtube.com."""
1133
1134 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1135 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1136 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1137 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1138 _NETRC_MACHINE = 'youtube'
1139 # Listed in order of quality
1140 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1141 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1142 _video_extensions = {
1143 '13': '3gp',
1144 '17': 'mp4',
1145 '18': 'mp4',
1146 '22': 'mp4',
1147 '37': 'mp4',
1148 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1149 '43': 'webm',
1150 '44': 'webm',
1151 '45': 'webm',
1152 }
1153 _video_dimensions = {
1154 '5': '240x400',
1155 '6': '???',
1156 '13': '???',
1157 '17': '144x176',
1158 '18': '360x640',
1159 '22': '720x1280',
1160 '34': '360x640',
1161 '35': '480x854',
1162 '37': '1080x1920',
1163 '38': '3072x4096',
1164 '43': '360x640',
1165 '44': '480x854',
1166 '45': '720x1280',
1167 }
1168 IE_NAME = u'youtube'
1169
1170 def report_lang(self):
1171 """Report attempt to set language."""
1172 self._downloader.to_screen(u'[youtube] Setting language')
1173
1174 def report_login(self):
1175 """Report attempt to log in."""
1176 self._downloader.to_screen(u'[youtube] Logging in')
1177
1178 def report_age_confirmation(self):
1179 """Report attempt to confirm age."""
1180 self._downloader.to_screen(u'[youtube] Confirming age')
1181
1182 def report_video_webpage_download(self, video_id):
1183 """Report attempt to download video webpage."""
1184 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1185
1186 def report_video_info_webpage_download(self, video_id):
1187 """Report attempt to download video info webpage."""
1188 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1189
1190 def report_information_extraction(self, video_id):
1191 """Report attempt to extract video information."""
1192 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1193
1194 def report_unavailable_format(self, video_id, format):
1195 """Report extracted video URL."""
1196 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1197
1198 def report_rtmp_download(self):
1199 """Indicate the download will use the RTMP protocol."""
1200 self._downloader.to_screen(u'[youtube] RTMP download detected')
1201
1202 def _print_formats(self, formats):
1203 print 'Available formats:'
1204 for x in formats:
1205 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1206
1207 def _real_initialize(self):
1208 if self._downloader is None:
1209 return
1210
1211 username = None
1212 password = None
1213 downloader_params = self._downloader.params
1214
1215 # Attempt to use provided username and password or .netrc data
1216 if downloader_params.get('username', None) is not None:
1217 username = downloader_params['username']
1218 password = downloader_params['password']
1219 elif downloader_params.get('usenetrc', False):
1220 try:
1221 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1222 if info is not None:
1223 username = info[0]
1224 password = info[2]
1225 else:
1226 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1227 except (IOError, netrc.NetrcParseError), err:
1228 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1229 return
1230
1231 # Set language
1232 request = urllib2.Request(self._LANG_URL)
1233 try:
1234 self.report_lang()
1235 urllib2.urlopen(request).read()
1236 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1237 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1238 return
1239
1240 # No authentication to be performed
1241 if username is None:
1242 return
1243
1244 # Log in
1245 login_form = {
1246 'current_form': 'loginForm',
1247 'next': '/',
1248 'action_login': 'Log In',
1249 'username': username,
1250 'password': password,
1251 }
1252 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1253 try:
1254 self.report_login()
1255 login_results = urllib2.urlopen(request).read()
1256 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1257 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1258 return
1259 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1260 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1261 return
1262
1263 # Confirm age
1264 age_form = {
1265 'next_url': '/',
1266 'action_confirm': 'Confirm',
1267 }
1268 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1269 try:
1270 self.report_age_confirmation()
1271 age_results = urllib2.urlopen(request).read()
1272 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1273 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1274 return
1275
1276 def _real_extract(self, url):
1277 # Extract video id from URL
1278 mobj = re.match(self._VALID_URL, url)
1279 if mobj is None:
1280 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1281 return
1282 video_id = mobj.group(2)
1283
1284 # Get video webpage
1285 self.report_video_webpage_download(video_id)
1286 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1287 try:
1288 video_webpage = urllib2.urlopen(request).read()
1289 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1290 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1291 return
1292
1293 # Attempt to extract SWF player URL
1294 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1295 if mobj is not None:
1296 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1297 else:
1298 player_url = None
1299
1300 # Get video info
1301 self.report_video_info_webpage_download(video_id)
1302 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1303 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1304 % (video_id, el_type))
1305 request = urllib2.Request(video_info_url)
1306 try:
1307 video_info_webpage = urllib2.urlopen(request).read()
1308 video_info = parse_qs(video_info_webpage)
1309 if 'token' in video_info:
1310 break
1311 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1312 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1313 return
1314 if 'token' not in video_info:
1315 if 'reason' in video_info:
1316 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1317 else:
1318 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1319 return
1320
1321 # Start extracting information
1322 self.report_information_extraction(video_id)
1323
1324 # uploader
1325 if 'author' not in video_info:
1326 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1327 return
1328 video_uploader = urllib.unquote_plus(video_info['author'][0])
1329
1330 # title
1331 if 'title' not in video_info:
1332 self._downloader.trouble(u'ERROR: unable to extract video title')
1333 return
1334 video_title = urllib.unquote_plus(video_info['title'][0])
1335 video_title = video_title.decode('utf-8')
1336 video_title = sanitize_title(video_title)
1337
1338 # simplified title
1339 simple_title = _simplify_title(video_title)
1340
1341 # thumbnail image
1342 if 'thumbnail_url' not in video_info:
1343 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1344 video_thumbnail = ''
1345 else: # don't panic if we can't find it
1346 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1347
1348 # upload date
1349 upload_date = u'NA'
1350 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1351 if mobj is not None:
1352 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1353 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1354 for expression in format_expressions:
1355 try:
1356 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1357 except:
1358 pass
1359
1360 # description
1361 try:
1362 lxml.etree
1363 except NameError:
1364 video_description = u'No description available.'
1365 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1366 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1367 if mobj is not None:
1368 video_description = mobj.group(1).decode('utf-8')
1369 else:
1370 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1371 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1372 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1373 # TODO use another parser
1374
1375 # token
1376 video_token = urllib.unquote_plus(video_info['token'][0])
1377
1378 # Decide which formats to download
1379 req_format = self._downloader.params.get('format', None)
1380
1381 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1382 self.report_rtmp_download()
1383 video_url_list = [(None, video_info['conn'][0])]
1384 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1385 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1386 url_data = [parse_qs(uds) for uds in url_data_strs]
1387 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1388 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1389
1390 format_limit = self._downloader.params.get('format_limit', None)
1391 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1392 if format_limit is not None and format_limit in available_formats:
1393 format_list = available_formats[available_formats.index(format_limit):]
1394 else:
1395 format_list = available_formats
1396 existing_formats = [x for x in format_list if x in url_map]
1397 if len(existing_formats) == 0:
1398 self._downloader.trouble(u'ERROR: no known formats available for video')
1399 return
1400 if self._downloader.params.get('listformats', None):
1401 self._print_formats(existing_formats)
1402 return
1403 if req_format is None or req_format == 'best':
1404 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1405 elif req_format == 'worst':
1406 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1407 elif req_format in ('-1', 'all'):
1408 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1409 else:
1410 # Specific formats. We pick the first in a slash-delimeted sequence.
1411 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1412 req_formats = req_format.split('/')
1413 video_url_list = None
1414 for rf in req_formats:
1415 if rf in url_map:
1416 video_url_list = [(rf, url_map[rf])]
1417 break
1418 if video_url_list is None:
1419 self._downloader.trouble(u'ERROR: requested format not available')
1420 return
1421 else:
1422 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1423 return
1424
1425 for format_param, video_real_url in video_url_list:
1426 # At this point we have a new video
1427 self._downloader.increment_downloads()
1428
1429 # Extension
1430 video_extension = self._video_extensions.get(format_param, 'flv')
1431
1432 try:
1433 # Process video information
1434 self._downloader.process_info({
1435 'id': video_id.decode('utf-8'),
1436 'url': video_real_url.decode('utf-8'),
1437 'uploader': video_uploader.decode('utf-8'),
1438 'upload_date': upload_date,
1439 'title': video_title,
1440 'stitle': simple_title,
1441 'ext': video_extension.decode('utf-8'),
1442 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1443 'thumbnail': video_thumbnail.decode('utf-8'),
1444 'description': video_description,
1445 'player_url': player_url,
1446 })
1447 except UnavailableVideoError, err:
1448 self._downloader.trouble(u'\nERROR: unable to download video')
1449
1450
1451 class MetacafeIE(InfoExtractor):
1452 """Information Extractor for metacafe.com."""
1453
1454 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1455 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1456 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1457 _youtube_ie = None
1458 IE_NAME = u'metacafe'
1459
1460 def __init__(self, youtube_ie, downloader=None):
1461 InfoExtractor.__init__(self, downloader)
1462 self._youtube_ie = youtube_ie
1463
1464 def report_disclaimer(self):
1465 """Report disclaimer retrieval."""
1466 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1467
1468 def report_age_confirmation(self):
1469 """Report attempt to confirm age."""
1470 self._downloader.to_screen(u'[metacafe] Confirming age')
1471
1472 def report_download_webpage(self, video_id):
1473 """Report webpage download."""
1474 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1475
1476 def report_extraction(self, video_id):
1477 """Report information extraction."""
1478 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1479
1480 def _real_initialize(self):
1481 # Retrieve disclaimer
1482 request = urllib2.Request(self._DISCLAIMER)
1483 try:
1484 self.report_disclaimer()
1485 disclaimer = urllib2.urlopen(request).read()
1486 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1487 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1488 return
1489
1490 # Confirm age
1491 disclaimer_form = {
1492 'filters': '0',
1493 'submit': "Continue - I'm over 18",
1494 }
1495 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1496 try:
1497 self.report_age_confirmation()
1498 disclaimer = urllib2.urlopen(request).read()
1499 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1500 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1501 return
1502
1503 def _real_extract(self, url):
1504 # Extract id and simplified title from URL
1505 mobj = re.match(self._VALID_URL, url)
1506 if mobj is None:
1507 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1508 return
1509
1510 video_id = mobj.group(1)
1511
1512 # Check if video comes from YouTube
1513 mobj2 = re.match(r'^yt-(.*)$', video_id)
1514 if mobj2 is not None:
1515 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1516 return
1517
1518 # At this point we have a new video
1519 self._downloader.increment_downloads()
1520
1521 simple_title = mobj.group(2).decode('utf-8')
1522
1523 # Retrieve video webpage to extract further information
1524 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1525 try:
1526 self.report_download_webpage(video_id)
1527 webpage = urllib2.urlopen(request).read()
1528 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1529 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1530 return
1531
1532 # Extract URL, uploader and title from webpage
1533 self.report_extraction(video_id)
1534 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1535 if mobj is not None:
1536 mediaURL = urllib.unquote(mobj.group(1))
1537 video_extension = mediaURL[-3:]
1538
1539 # Extract gdaKey if available
1540 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1541 if mobj is None:
1542 video_url = mediaURL
1543 else:
1544 gdaKey = mobj.group(1)
1545 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1546 else:
1547 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1548 if mobj is None:
1549 self._downloader.trouble(u'ERROR: unable to extract media URL')
1550 return
1551 vardict = parse_qs(mobj.group(1))
1552 if 'mediaData' not in vardict:
1553 self._downloader.trouble(u'ERROR: unable to extract media URL')
1554 return
1555 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1556 if mobj is None:
1557 self._downloader.trouble(u'ERROR: unable to extract media URL')
1558 return
1559 mediaURL = mobj.group(1).replace('\\/', '/')
1560 video_extension = mediaURL[-3:]
1561 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1562
1563 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1564 if mobj is None:
1565 self._downloader.trouble(u'ERROR: unable to extract title')
1566 return
1567 video_title = mobj.group(1).decode('utf-8')
1568 video_title = sanitize_title(video_title)
1569
1570 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1571 if mobj is None:
1572 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1573 return
1574 video_uploader = mobj.group(1)
1575
1576 try:
1577 # Process video information
1578 self._downloader.process_info({
1579 'id': video_id.decode('utf-8'),
1580 'url': video_url.decode('utf-8'),
1581 'uploader': video_uploader.decode('utf-8'),
1582 'upload_date': u'NA',
1583 'title': video_title,
1584 'stitle': simple_title,
1585 'ext': video_extension.decode('utf-8'),
1586 'format': u'NA',
1587 'player_url': None,
1588 })
1589 except UnavailableVideoError:
1590 self._downloader.trouble(u'\nERROR: unable to download video')
1591
1592
1593 class DailymotionIE(InfoExtractor):
1594 """Information Extractor for Dailymotion"""
1595
1596 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1597 IE_NAME = u'dailymotion'
1598
1599 def __init__(self, downloader=None):
1600 InfoExtractor.__init__(self, downloader)
1601
1602 def report_download_webpage(self, video_id):
1603 """Report webpage download."""
1604 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1605
1606 def report_extraction(self, video_id):
1607 """Report information extraction."""
1608 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1609
1610 def _real_extract(self, url):
1611 # Extract id and simplified title from URL
1612 mobj = re.match(self._VALID_URL, url)
1613 if mobj is None:
1614 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1615 return
1616
1617 # At this point we have a new video
1618 self._downloader.increment_downloads()
1619 video_id = mobj.group(1)
1620
1621 video_extension = 'flv'
1622
1623 # Retrieve video webpage to extract further information
1624 request = urllib2.Request(url)
1625 request.add_header('Cookie', 'family_filter=off')
1626 try:
1627 self.report_download_webpage(video_id)
1628 webpage = urllib2.urlopen(request).read()
1629 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1630 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1631 return
1632
1633 # Extract URL, uploader and title from webpage
1634 self.report_extraction(video_id)
1635 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1636 if mobj is None:
1637 self._downloader.trouble(u'ERROR: unable to extract media URL')
1638 return
1639 sequence = urllib.unquote(mobj.group(1))
1640 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1641 if mobj is None:
1642 self._downloader.trouble(u'ERROR: unable to extract media URL')
1643 return
1644 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1645
1646 # if needed add http://www.dailymotion.com/ if relative URL
1647
1648 video_url = mediaURL
1649
1650 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1651 if mobj is None:
1652 self._downloader.trouble(u'ERROR: unable to extract title')
1653 return
1654 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1655 video_title = sanitize_title(video_title)
1656 simple_title = _simplify_title(video_title)
1657
1658 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1659 if mobj is None:
1660 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1661 return
1662 video_uploader = mobj.group(1)
1663
1664 try:
1665 # Process video information
1666 self._downloader.process_info({
1667 'id': video_id.decode('utf-8'),
1668 'url': video_url.decode('utf-8'),
1669 'uploader': video_uploader.decode('utf-8'),
1670 'upload_date': u'NA',
1671 'title': video_title,
1672 'stitle': simple_title,
1673 'ext': video_extension.decode('utf-8'),
1674 'format': u'NA',
1675 'player_url': None,
1676 })
1677 except UnavailableVideoError:
1678 self._downloader.trouble(u'\nERROR: unable to download video')
1679
1680
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract metadata and media URL for a video.google.com video.

		Tries the mp4 download_url first and falls back to the flv
		videoUrl embedded in the page. Reports errors through the
		downloader's trouble() and returns None in all cases.
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# Optimistic default; corrected to 'flv' below if only the
		# fallback videoUrl is present.
		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct mp4 download link; fall back to the flv stream URL,
			# which the page embeds with \x-escaped characters.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = urllib.unquote(mobj.group(1))
			# Undo the page's hex escaping: '\x3d' -> '=' and '\x26' -> '&'
			mediaURL = mediaURL.replace('\\x3d', '\x3d')
			mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail (only when requested: it costs an extra
		# search-page request)
		if self._downloader.params.get('forcethumbnail', False):
			# NOTE(review): abs(int(video_id)) assumes docid is a (possibly
			# negative) integer string; a non-numeric docid would raise
			# ValueError here — confirm against real docids.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		try:
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': u'NA',
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': u'NA',
				'player_url': None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1783
1784
1785 class PhotobucketIE(InfoExtractor):
1786 """Information extractor for photobucket.com."""
1787
1788 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1789 IE_NAME = u'photobucket'
1790
1791 def __init__(self, downloader=None):
1792 InfoExtractor.__init__(self, downloader)
1793
1794 def report_download_webpage(self, video_id):
1795 """Report webpage download."""
1796 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1797
1798 def report_extraction(self, video_id):
1799 """Report information extraction."""
1800 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1801
1802 def _real_extract(self, url):
1803 # Extract id from URL
1804 mobj = re.match(self._VALID_URL, url)
1805 if mobj is None:
1806 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1807 return
1808
1809 # At this point we have a new video
1810 self._downloader.increment_downloads()
1811 video_id = mobj.group(1)
1812
1813 video_extension = 'flv'
1814
1815 # Retrieve video webpage to extract further information
1816 request = urllib2.Request(url)
1817 try:
1818 self.report_download_webpage(video_id)
1819 webpage = urllib2.urlopen(request).read()
1820 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1821 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1822 return
1823
1824 # Extract URL, uploader, and title from webpage
1825 self.report_extraction(video_id)
1826 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1827 if mobj is None:
1828 self._downloader.trouble(u'ERROR: unable to extract media URL')
1829 return
1830 mediaURL = urllib.unquote(mobj.group(1))
1831
1832 video_url = mediaURL
1833
1834 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1835 if mobj is None:
1836 self._downloader.trouble(u'ERROR: unable to extract title')
1837 return
1838 video_title = mobj.group(1).decode('utf-8')
1839 video_title = sanitize_title(video_title)
1840 simple_title = _simplify_title(vide_title)
1841
1842 video_uploader = mobj.group(2).decode('utf-8')
1843
1844 try:
1845 # Process video information
1846 self._downloader.process_info({
1847 'id': video_id.decode('utf-8'),
1848 'url': video_url.decode('utf-8'),
1849 'uploader': video_uploader,
1850 'upload_date': u'NA',
1851 'title': video_title,
1852 'stitle': simple_title,
1853 'ext': video_extension.decode('utf-8'),
1854 'format': u'NA',
1855 'player_url': None,
1856 })
1857 except UnavailableVideoError:
1858 self._downloader.trouble(u'\nERROR: unable to download video')
1859
1860
1861 class YahooIE(InfoExtractor):
1862 """Information extractor for video.yahoo.com."""
1863
1864 # _VALID_URL matches all Yahoo! Video URLs
1865 # _VPAGE_URL matches only the extractable '/watch/' URLs
1866 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1867 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1868 IE_NAME = u'video.yahoo'
1869
1870 def __init__(self, downloader=None):
1871 InfoExtractor.__init__(self, downloader)
1872
1873 def report_download_webpage(self, video_id):
1874 """Report webpage download."""
1875 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1876
1877 def report_extraction(self, video_id):
1878 """Report information extraction."""
1879 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1880
1881 def _real_extract(self, url, new_video=True):
1882 # Extract ID from URL
1883 mobj = re.match(self._VALID_URL, url)
1884 if mobj is None:
1885 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1886 return
1887
1888 # At this point we have a new video
1889 self._downloader.increment_downloads()
1890 video_id = mobj.group(2)
1891 video_extension = 'flv'
1892
1893 # Rewrite valid but non-extractable URLs as
1894 # extractable English language /watch/ URLs
1895 if re.match(self._VPAGE_URL, url) is None:
1896 request = urllib2.Request(url)
1897 try:
1898 webpage = urllib2.urlopen(request).read()
1899 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1900 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1901 return
1902
1903 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1904 if mobj is None:
1905 self._downloader.trouble(u'ERROR: Unable to extract id field')
1906 return
1907 yahoo_id = mobj.group(1)
1908
1909 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1910 if mobj is None:
1911 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1912 return
1913 yahoo_vid = mobj.group(1)
1914
1915 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1916 return self._real_extract(url, new_video=False)
1917
1918 # Retrieve video webpage to extract further information
1919 request = urllib2.Request(url)
1920 try:
1921 self.report_download_webpage(video_id)
1922 webpage = urllib2.urlopen(request).read()
1923 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1924 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1925 return
1926
1927 # Extract uploader and title from webpage
1928 self.report_extraction(video_id)
1929 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1930 if mobj is None:
1931 self._downloader.trouble(u'ERROR: unable to extract video title')
1932 return
1933 video_title = mobj.group(1).decode('utf-8')
1934 simple_title = _simplify_title(video_title)
1935
1936 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1937 if mobj is None:
1938 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1939 return
1940 video_uploader = mobj.group(1).decode('utf-8')
1941
1942 # Extract video thumbnail
1943 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1944 if mobj is None:
1945 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1946 return
1947 video_thumbnail = mobj.group(1).decode('utf-8')
1948
1949 # Extract video description
1950 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1951 if mobj is None:
1952 self._downloader.trouble(u'ERROR: unable to extract video description')
1953 return
1954 video_description = mobj.group(1).decode('utf-8')
1955 if not video_description:
1956 video_description = 'No description available.'
1957
1958 # Extract video height and width
1959 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1960 if mobj is None:
1961 self._downloader.trouble(u'ERROR: unable to extract video height')
1962 return
1963 yv_video_height = mobj.group(1)
1964
1965 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1966 if mobj is None:
1967 self._downloader.trouble(u'ERROR: unable to extract video width')
1968 return
1969 yv_video_width = mobj.group(1)
1970
1971 # Retrieve video playlist to extract media URL
1972 # I'm not completely sure what all these options are, but we
1973 # seem to need most of them, otherwise the server sends a 401.
1974 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1975 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1976 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1977 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1978 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1979 try:
1980 self.report_download_webpage(video_id)
1981 webpage = urllib2.urlopen(request).read()
1982 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1983 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1984 return
1985
1986 # Extract media URL from playlist XML
1987 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1988 if mobj is None:
1989 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1990 return
1991 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1992 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1993
1994 try:
1995 # Process video information
1996 self._downloader.process_info({
1997 'id': video_id.decode('utf-8'),
1998 'url': video_url,
1999 'uploader': video_uploader,
2000 'upload_date': u'NA',
2001 'title': video_title,
2002 'stitle': simple_title,
2003 'ext': video_extension.decode('utf-8'),
2004 'thumbnail': video_thumbnail.decode('utf-8'),
2005 'description': video_description,
2006 'thumbnail': video_thumbnail,
2007 'player_url': None,
2008 })
2009 except UnavailableVideoError:
2010 self._downloader.trouble(u'\nERROR: unable to download video')
2011
2012
2013 class VimeoIE(InfoExtractor):
2014 """Information extractor for vimeo.com."""
2015
2016 # _VALID_URL matches Vimeo URLs
2017 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2018 IE_NAME = u'vimeo'
2019
2020 def __init__(self, downloader=None):
2021 InfoExtractor.__init__(self, downloader)
2022
2023 def report_download_webpage(self, video_id):
2024 """Report webpage download."""
2025 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2026
2027 def report_extraction(self, video_id):
2028 """Report information extraction."""
2029 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2030
2031 def _real_extract(self, url, new_video=True):
2032 # Extract ID from URL
2033 mobj = re.match(self._VALID_URL, url)
2034 if mobj is None:
2035 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2036 return
2037
2038 # At this point we have a new video
2039 self._downloader.increment_downloads()
2040 video_id = mobj.group(1)
2041
2042 # Retrieve video webpage to extract further information
2043 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2044 try:
2045 self.report_download_webpage(video_id)
2046 webpage = urllib2.urlopen(request).read()
2047 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2048 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2049 return
2050
2051 # Now we begin extracting as much information as we can from what we
2052 # retrieved. First we extract the information common to all extractors,
2053 # and latter we extract those that are Vimeo specific.
2054 self.report_extraction(video_id)
2055
2056 # Extract title
2057 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2058 if mobj is None:
2059 self._downloader.trouble(u'ERROR: unable to extract video title')
2060 return
2061 video_title = mobj.group(1).decode('utf-8')
2062 simple_title = _simplify_title(video_title)
2063
2064 # Extract uploader
2065 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2066 if mobj is None:
2067 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2068 return
2069 video_uploader = mobj.group(1).decode('utf-8')
2070
2071 # Extract video thumbnail
2072 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2073 if mobj is None:
2074 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2075 return
2076 video_thumbnail = mobj.group(1).decode('utf-8')
2077
2078 # # Extract video description
2079 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2080 # if mobj is None:
2081 # self._downloader.trouble(u'ERROR: unable to extract video description')
2082 # return
2083 # video_description = mobj.group(1).decode('utf-8')
2084 # if not video_description: video_description = 'No description available.'
2085 video_description = 'Foo.'
2086
2087 # Vimeo specific: extract request signature
2088 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2089 if mobj is None:
2090 self._downloader.trouble(u'ERROR: unable to extract request signature')
2091 return
2092 sig = mobj.group(1).decode('utf-8')
2093
2094 # Vimeo specific: extract video quality information
2095 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2096 if mobj is None:
2097 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2098 return
2099 quality = mobj.group(1).decode('utf-8')
2100
2101 if int(quality) == 1:
2102 quality = 'hd'
2103 else:
2104 quality = 'sd'
2105
2106 # Vimeo specific: Extract request signature expiration
2107 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2108 if mobj is None:
2109 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2110 return
2111 sig_exp = mobj.group(1).decode('utf-8')
2112
2113 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2114
2115 try:
2116 # Process video information
2117 self._downloader.process_info({
2118 'id': video_id.decode('utf-8'),
2119 'url': video_url,
2120 'uploader': video_uploader,
2121 'upload_date': u'NA',
2122 'title': video_title,
2123 'stitle': simple_title,
2124 'ext': u'mp4',
2125 'thumbnail': video_thumbnail.decode('utf-8'),
2126 'description': video_description,
2127 'thumbnail': video_thumbnail,
2128 'description': video_description,
2129 'player_url': None,
2130 })
2131 except UnavailableVideoError:
2132 self._downloader.trouble(u'ERROR: unable to download video')
2133
2134
2135 class GenericIE(InfoExtractor):
2136 """Generic last-resort information extractor."""
2137
2138 _VALID_URL = r'.*'
2139 IE_NAME = u'generic'
2140
2141 def __init__(self, downloader=None):
2142 InfoExtractor.__init__(self, downloader)
2143
2144 def report_download_webpage(self, video_id):
2145 """Report webpage download."""
2146 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2147 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2148
2149 def report_extraction(self, video_id):
2150 """Report information extraction."""
2151 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2152
2153 def _real_extract(self, url):
2154 # At this point we have a new video
2155 self._downloader.increment_downloads()
2156
2157 video_id = url.split('/')[-1]
2158 request = urllib2.Request(url)
2159 try:
2160 self.report_download_webpage(video_id)
2161 webpage = urllib2.urlopen(request).read()
2162 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2163 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2164 return
2165 except ValueError, err:
2166 # since this is the last-resort InfoExtractor, if
2167 # this error is thrown, it'll be thrown here
2168 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2169 return
2170
2171 self.report_extraction(video_id)
2172 # Start with something easy: JW Player in SWFObject
2173 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2174 if mobj is None:
2175 # Broaden the search a little bit
2176 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2177 if mobj is None:
2178 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2179 return
2180
2181 # It's possible that one of the regexes
2182 # matched, but returned an empty group:
2183 if mobj.group(1) is None:
2184 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2185 return
2186
2187 video_url = urllib.unquote(mobj.group(1))
2188 video_id = os.path.basename(video_url)
2189
2190 # here's a fun little line of code for you:
2191 video_extension = os.path.splitext(video_id)[1][1:]
2192 video_id = os.path.splitext(video_id)[0]
2193
2194 # it's tempting to parse this further, but you would
2195 # have to take into account all the variations like
2196 # Video Title - Site Name
2197 # Site Name | Video Title
2198 # Video Title - Tagline | Site Name
2199 # and so on and so forth; it's just not practical
2200 mobj = re.search(r'<title>(.*)</title>', webpage)
2201 if mobj is None:
2202 self._downloader.trouble(u'ERROR: unable to extract title')
2203 return
2204 video_title = mobj.group(1).decode('utf-8')
2205 video_title = sanitize_title(video_title)
2206 simple_title = _simplify_title(video_title)
2207
2208 # video uploader is domain name
2209 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2210 if mobj is None:
2211 self._downloader.trouble(u'ERROR: unable to extract title')
2212 return
2213 video_uploader = mobj.group(1).decode('utf-8')
2214
2215 try:
2216 # Process video information
2217 self._downloader.process_info({
2218 'id': video_id.decode('utf-8'),
2219 'url': video_url.decode('utf-8'),
2220 'uploader': video_uploader,
2221 'upload_date': u'NA',
2222 'title': video_title,
2223 'stitle': simple_title,
2224 'ext': video_extension.decode('utf-8'),
2225 'format': u'NA',
2226 'player_url': None,
2227 })
2228 except UnavailableVideoError, err:
2229 self._downloader.trouble(u'\nERROR: unable to download video')
2230
2231
2232 class YoutubeSearchIE(InfoExtractor):
2233 """Information Extractor for YouTube search queries."""
2234 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2235 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2236 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2237 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2238 _youtube_ie = None
2239 _max_youtube_results = 1000
2240 IE_NAME = u'youtube:search'
2241
2242 def __init__(self, youtube_ie, downloader=None):
2243 InfoExtractor.__init__(self, downloader)
2244 self._youtube_ie = youtube_ie
2245
2246 def report_download_page(self, query, pagenum):
2247 """Report attempt to download playlist page with given number."""
2248 query = query.decode(preferredencoding())
2249 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2250
2251 def _real_initialize(self):
2252 self._youtube_ie.initialize()
2253
2254 def _real_extract(self, query):
2255 mobj = re.match(self._VALID_URL, query)
2256 if mobj is None:
2257 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2258 return
2259
2260 prefix, query = query.split(':')
2261 prefix = prefix[8:]
2262 query = query.encode('utf-8')
2263 if prefix == '':
2264 self._download_n_results(query, 1)
2265 return
2266 elif prefix == 'all':
2267 self._download_n_results(query, self._max_youtube_results)
2268 return
2269 else:
2270 try:
2271 n = long(prefix)
2272 if n <= 0:
2273 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2274 return
2275 elif n > self._max_youtube_results:
2276 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2277 n = self._max_youtube_results
2278 self._download_n_results(query, n)
2279 return
2280 except ValueError: # parsing prefix as integer fails
2281 self._download_n_results(query, 1)
2282 return
2283
2284 def _download_n_results(self, query, n):
2285 """Downloads a specified number of results for a query"""
2286
2287 video_ids = []
2288 already_seen = set()
2289 pagenum = 1
2290
2291 while True:
2292 self.report_download_page(query, pagenum)
2293 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2294 request = urllib2.Request(result_url)
2295 try:
2296 page = urllib2.urlopen(request).read()
2297 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2298 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2299 return
2300
2301 # Extract video identifiers
2302 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2303 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2304 if video_id not in already_seen:
2305 video_ids.append(video_id)
2306 already_seen.add(video_id)
2307 if len(video_ids) == n:
2308 # Specified n videos reached
2309 for id in video_ids:
2310 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2311 return
2312
2313 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2314 for id in video_ids:
2315 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2316 return
2317
2318 pagenum = pagenum + 1
2319
2320
2321 class GoogleSearchIE(InfoExtractor):
2322 """Information Extractor for Google Video search queries."""
2323 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2324 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2325 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2326 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2327 _google_ie = None
2328 _max_google_results = 1000
2329 IE_NAME = u'video.google:search'
2330
2331 def __init__(self, google_ie, downloader=None):
2332 InfoExtractor.__init__(self, downloader)
2333 self._google_ie = google_ie
2334
2335 def report_download_page(self, query, pagenum):
2336 """Report attempt to download playlist page with given number."""
2337 query = query.decode(preferredencoding())
2338 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2339
2340 def _real_initialize(self):
2341 self._google_ie.initialize()
2342
2343 def _real_extract(self, query):
2344 mobj = re.match(self._VALID_URL, query)
2345 if mobj is None:
2346 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2347 return
2348
2349 prefix, query = query.split(':')
2350 prefix = prefix[8:]
2351 query = query.encode('utf-8')
2352 if prefix == '':
2353 self._download_n_results(query, 1)
2354 return
2355 elif prefix == 'all':
2356 self._download_n_results(query, self._max_google_results)
2357 return
2358 else:
2359 try:
2360 n = long(prefix)
2361 if n <= 0:
2362 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2363 return
2364 elif n > self._max_google_results:
2365 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2366 n = self._max_google_results
2367 self._download_n_results(query, n)
2368 return
2369 except ValueError: # parsing prefix as integer fails
2370 self._download_n_results(query, 1)
2371 return
2372
2373 def _download_n_results(self, query, n):
2374 """Downloads a specified number of results for a query"""
2375
2376 video_ids = []
2377 already_seen = set()
2378 pagenum = 1
2379
2380 while True:
2381 self.report_download_page(query, pagenum)
2382 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2383 request = urllib2.Request(result_url)
2384 try:
2385 page = urllib2.urlopen(request).read()
2386 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2387 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2388 return
2389
2390 # Extract video identifiers
2391 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2392 video_id = mobj.group(1)
2393 if video_id not in already_seen:
2394 video_ids.append(video_id)
2395 already_seen.add(video_id)
2396 if len(video_ids) == n:
2397 # Specified n videos reached
2398 for id in video_ids:
2399 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2400 return
2401
2402 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2403 for id in video_ids:
2404 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2405 return
2406
2407 pagenum = pagenum + 1
2408
2409
2410 class YahooSearchIE(InfoExtractor):
2411 """Information Extractor for Yahoo! Video search queries."""
2412 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2413 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2414 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2415 _MORE_PAGES_INDICATOR = r'\s*Next'
2416 _yahoo_ie = None
2417 _max_yahoo_results = 1000
2418 IE_NAME = u'video.yahoo:search'
2419
2420 def __init__(self, yahoo_ie, downloader=None):
2421 InfoExtractor.__init__(self, downloader)
2422 self._yahoo_ie = yahoo_ie
2423
2424 def report_download_page(self, query, pagenum):
2425 """Report attempt to download playlist page with given number."""
2426 query = query.decode(preferredencoding())
2427 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2428
2429 def _real_initialize(self):
2430 self._yahoo_ie.initialize()
2431
2432 def _real_extract(self, query):
2433 mobj = re.match(self._VALID_URL, query)
2434 if mobj is None:
2435 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2436 return
2437
2438 prefix, query = query.split(':')
2439 prefix = prefix[8:]
2440 query = query.encode('utf-8')
2441 if prefix == '':
2442 self._download_n_results(query, 1)
2443 return
2444 elif prefix == 'all':
2445 self._download_n_results(query, self._max_yahoo_results)
2446 return
2447 else:
2448 try:
2449 n = long(prefix)
2450 if n <= 0:
2451 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2452 return
2453 elif n > self._max_yahoo_results:
2454 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2455 n = self._max_yahoo_results
2456 self._download_n_results(query, n)
2457 return
2458 except ValueError: # parsing prefix as integer fails
2459 self._download_n_results(query, 1)
2460 return
2461
2462 def _download_n_results(self, query, n):
2463 """Downloads a specified number of results for a query"""
2464
2465 video_ids = []
2466 already_seen = set()
2467 pagenum = 1
2468
2469 while True:
2470 self.report_download_page(query, pagenum)
2471 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2472 request = urllib2.Request(result_url)
2473 try:
2474 page = urllib2.urlopen(request).read()
2475 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2476 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2477 return
2478
2479 # Extract video identifiers
2480 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2481 video_id = mobj.group(1)
2482 if video_id not in already_seen:
2483 video_ids.append(video_id)
2484 already_seen.add(video_id)
2485 if len(video_ids) == n:
2486 # Specified n videos reached
2487 for id in video_ids:
2488 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2489 return
2490
2491 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2492 for id in video_ids:
2493 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2494 return
2495
2496 pagenum = pagenum + 1
2497
2498
2499 class YoutubePlaylistIE(InfoExtractor):
2500 """Information Extractor for YouTube playlists."""
2501
2502 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2503 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2504 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2505 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2506 _youtube_ie = None
2507 IE_NAME = u'youtube:playlist'
2508
2509 def __init__(self, youtube_ie, downloader=None):
2510 InfoExtractor.__init__(self, downloader)
2511 self._youtube_ie = youtube_ie
2512
2513 def report_download_page(self, playlist_id, pagenum):
2514 """Report attempt to download playlist page with given number."""
2515 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2516
2517 def _real_initialize(self):
2518 self._youtube_ie.initialize()
2519
2520 def _real_extract(self, url):
2521 # Extract playlist id
2522 mobj = re.match(self._VALID_URL, url)
2523 if mobj is None:
2524 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2525 return
2526
2527 # Single video case
2528 if mobj.group(3) is not None:
2529 self._youtube_ie.extract(mobj.group(3))
2530 return
2531
2532 # Download playlist pages
2533 # prefix is 'p' as default for playlists but there are other types that need extra care
2534 playlist_prefix = mobj.group(1)
2535 if playlist_prefix == 'a':
2536 playlist_access = 'artist'
2537 else:
2538 playlist_prefix = 'p'
2539 playlist_access = 'view_play_list'
2540 playlist_id = mobj.group(2)
2541 video_ids = []
2542 pagenum = 1
2543
2544 while True:
2545 self.report_download_page(playlist_id, pagenum)
2546 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2547 request = urllib2.Request(url)
2548 try:
2549 page = urllib2.urlopen(request).read()
2550 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2551 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2552 return
2553
2554 # Extract video identifiers
2555 ids_in_page = []
2556 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2557 if mobj.group(1) not in ids_in_page:
2558 ids_in_page.append(mobj.group(1))
2559 video_ids.extend(ids_in_page)
2560
2561 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2562 break
2563 pagenum = pagenum + 1
2564
2565 playliststart = self._downloader.params.get('playliststart', 1) - 1
2566 playlistend = self._downloader.params.get('playlistend', -1)
2567 video_ids = video_ids[playliststart:playlistend]
2568
2569 for id in video_ids:
2570 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2571 return
2572
2573
2574 class YoutubeUserIE(InfoExtractor):
2575 """Information Extractor for YouTube users."""
2576
2577 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2578 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2579 _GDATA_PAGE_SIZE = 50
2580 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2581 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2582 _youtube_ie = None
2583 IE_NAME = u'youtube:user'
2584
2585 def __init__(self, youtube_ie, downloader=None):
2586 InfoExtractor.__init__(self, downloader)
2587 self._youtube_ie = youtube_ie
2588
2589 def report_download_page(self, username, start_index):
2590 """Report attempt to download user page."""
2591 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2592 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2593
2594 def _real_initialize(self):
2595 self._youtube_ie.initialize()
2596
2597 def _real_extract(self, url):
2598 # Extract username
2599 mobj = re.match(self._VALID_URL, url)
2600 if mobj is None:
2601 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2602 return
2603
2604 username = mobj.group(1)
2605
2606 # Download video ids using YouTube Data API. Result size per
2607 # query is limited (currently to 50 videos) so we need to query
2608 # page by page until there are no video ids - it means we got
2609 # all of them.
2610
2611 video_ids = []
2612 pagenum = 0
2613
2614 while True:
2615 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2616 self.report_download_page(username, start_index)
2617
2618 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2619
2620 try:
2621 page = urllib2.urlopen(request).read()
2622 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2623 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2624 return
2625
2626 # Extract video identifiers
2627 ids_in_page = []
2628
2629 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2630 if mobj.group(1) not in ids_in_page:
2631 ids_in_page.append(mobj.group(1))
2632
2633 video_ids.extend(ids_in_page)
2634
2635 # A little optimization - if current page is not
2636 # "full", ie. does not contain PAGE_SIZE video ids then
2637 # we can assume that this page is the last one - there
2638 # are no more ids on further pages - no need to query
2639 # again.
2640
2641 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2642 break
2643
2644 pagenum += 1
2645
2646 all_ids_count = len(video_ids)
2647 playliststart = self._downloader.params.get('playliststart', 1) - 1
2648 playlistend = self._downloader.params.get('playlistend', -1)
2649
2650 if playlistend == -1:
2651 video_ids = video_ids[playliststart:]
2652 else:
2653 video_ids = video_ids[playliststart:playlistend]
2654
2655 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2656 (username, all_ids_count, len(video_ids)))
2657
2658 for video_id in video_ids:
2659 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2660
2661
2662 class DepositFilesIE(InfoExtractor):
2663 """Information extractor for depositfiles.com"""
2664
2665 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2666 IE_NAME = u'DepositFiles'
2667
2668 def __init__(self, downloader=None):
2669 InfoExtractor.__init__(self, downloader)
2670
2671 def report_download_webpage(self, file_id):
2672 """Report webpage download."""
2673 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2674
2675 def report_extraction(self, file_id):
2676 """Report information extraction."""
2677 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2678
2679 def _real_extract(self, url):
2680 # At this point we have a new file
2681 self._downloader.increment_downloads()
2682
2683 file_id = url.split('/')[-1]
2684 # Rebuild url in english locale
2685 url = 'http://depositfiles.com/en/files/' + file_id
2686
2687 # Retrieve file webpage with 'Free download' button pressed
2688 free_download_indication = { 'gateway_result' : '1' }
2689 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2690 try:
2691 self.report_download_webpage(file_id)
2692 webpage = urllib2.urlopen(request).read()
2693 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2694 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2695 return
2696
2697 # Search for the real file URL
2698 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2699 if (mobj is None) or (mobj.group(1) is None):
2700 # Try to figure out reason of the error.
2701 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2702 if (mobj is not None) and (mobj.group(1) is not None):
2703 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2704 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2705 else:
2706 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2707 return
2708
2709 file_url = mobj.group(1)
2710 file_extension = os.path.splitext(file_url)[1][1:]
2711
2712 # Search for file title
2713 mobj = re.search(r'<b title="(.*?)">', webpage)
2714 if mobj is None:
2715 self._downloader.trouble(u'ERROR: unable to extract title')
2716 return
2717 file_title = mobj.group(1).decode('utf-8')
2718
2719 try:
2720 # Process file information
2721 self._downloader.process_info({
2722 'id': file_id.decode('utf-8'),
2723 'url': file_url.decode('utf-8'),
2724 'uploader': u'NA',
2725 'upload_date': u'NA',
2726 'title': file_title,
2727 'stitle': file_title,
2728 'ext': file_extension.decode('utf-8'),
2729 'format': u'NA',
2730 'player_url': None,
2731 })
2732 except UnavailableVideoError, err:
2733 self._downloader.trouble(u'ERROR: unable to download file')
2734
2735
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook"""

	# Matches video/photo permalinks; the numeric id is captured as 'ID'.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	# Mobile login endpoint used by _real_initialize.
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	_NETRC_MACHINE = 'facebook'
	# Format identifiers as they appear in the page's JS, best quality first.
	_available_formats = ['video', 'highqual', 'lowqual']
	_video_extensions = {
		'video': 'mp4',
		'highqual': 'mp4',
		'lowqual': 'mp4',
	}
	IE_NAME = u'facebook'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page.

		Returns a dict with any of 'title', 'description', 'owner',
		'thumbnail' that could be scraped, plus a 'video_urls' dict mapping
		format name -> media URL (possibly empty).
		"""
		# General data: each entry is a regex whose first group is the value.
		data = {'title': r'\("video_title", "(.*?)"\)',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
			}
		video_info = {}
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				# Values are JS-escaped and percent-encoded in the page.
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		# Video urls
		video_urls = {}
		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

		return video_info

	def _real_initialize(self):
		"""Log in to Facebook if credentials are available.

		Credentials come from the --username/--password options or, with
		--netrc, from the 'facebook' machine entry in ~/.netrc. Failures
		are reported as warnings; extraction proceeds unauthenticated.
		"""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		if useremail is None:
			# No credentials: continue anonymously.
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains a login form, the login failed.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
		try:
			page = urllib2.urlopen(request)
			video_webpage = page.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information
		video_info = self._parse_page(video_webpage)

		# uploader
		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = video_info['owner']

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		simple_title = _simplify_title(video_title)

		# thumbnail image (missing thumbnail is only a warning, not fatal)
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:
			video_thumbnail = video_info['thumbnail']

		# upload date
		# NOTE(review): _parse_page never sets an 'upload_date' key, so this
		# branch appears to be dead code — confirm before relying on it.
		upload_date = u'NA'
		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
				try:
					upload_date = time.strftime('%Y%m%d', timetuple[0:9])
				except:
					# Bare except keeps best-effort date parsing from
					# aborting the extraction.
					pass

		# description
		video_description = video_info.get('description', 'No description available.')

		url_map = video_info['video_urls']
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			# Restrict the candidate list to formats at or below the limit.
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

			for format_param, video_real_url in video_url_list:

				# At this point we have a new video
				self._downloader.increment_downloads()

				# Extension
				video_extension = self._video_extensions.get(format_param, 'mp4')

				try:
					# Process video information
					self._downloader.process_info({
						'id':		video_id.decode('utf-8'),
						'url':		video_real_url.decode('utf-8'),
						'uploader':	video_uploader.decode('utf-8'),
						'upload_date':	upload_date,
						'title':	video_title,
						'stitle':	simple_title,
						'ext':		video_extension.decode('utf-8'),
						'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
						'thumbnail':	video_thumbnail.decode('utf-8'),
						'description':	video_description.decode('utf-8'),
						'player_url':	None,
					})
				except UnavailableVideoError, err:
					self._downloader.trouble(u'\nERROR: unable to download video')
2951
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Captures the filename extension at the end of a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL points directly at a media file."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Ask blip.tv for JSON metadata by appending skin=json to the URL.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			# If the server answers with the media itself rather than JSON,
			# treat it as a direct download and keep the open handle.
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				# urlh was opened in the try block above; read the JSON body.
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# The payload is either wrapped in a 'Post' key or bare.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3044
3045
3046 class MyVideoIE(InfoExtractor):
3047 """Information Extractor for myvideo.de."""
3048
3049 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3050 IE_NAME = u'myvideo'
3051
3052 def __init__(self, downloader=None):
3053 InfoExtractor.__init__(self, downloader)
3054
3055 def report_download_webpage(self, video_id):
3056 """Report webpage download."""
3057 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3058
3059 def report_extraction(self, video_id):
3060 """Report information extraction."""
3061 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3062
3063 def _real_extract(self,url):
3064 mobj = re.match(self._VALID_URL, url)
3065 if mobj is None:
3066 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3067 return
3068
3069 video_id = mobj.group(1)
3070
3071 # Get video webpage
3072 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3073 try:
3074 self.report_download_webpage(video_id)
3075 webpage = urllib2.urlopen(request).read()
3076 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3077 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3078 return
3079
3080 self.report_extraction(video_id)
3081 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3082 webpage)
3083 if mobj is None:
3084 self._downloader.trouble(u'ERROR: unable to extract media URL')
3085 return
3086 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3087
3088 mobj = re.search('<title>([^<]+)</title>', webpage)
3089 if mobj is None:
3090 self._downloader.trouble(u'ERROR: unable to extract title')
3091 return
3092
3093 video_title = mobj.group(1)
3094 video_title = sanitize_title(video_title)
3095
3096 simple_title = _simplify_title(video_title)
3097
3098 try:
3099 self._downloader.process_info({
3100 'id': video_id,
3101 'url': video_url,
3102 'uploader': u'NA',
3103 'upload_date': u'NA',
3104 'title': video_title,
3105 'stitle': simple_title,
3106 'ext': u'flv',
3107 'format': u'NA',
3108 'player_url': None,
3109 })
3110 except UnavailableVideoError:
3111 self._downloader.trouble(u'\nERROR: Unable to download video')
3112
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ':tds'/':colbert' style shortcut or a full-episodes URL.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that extraction of an episode has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of the per-media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode's MRSS show index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Expand ':tds' style shortcuts to the show's full-episodes page.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty episode group means "download the newest episode".
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# The full-episodes page redirects to the newest episode;
			# re-match against the redirected URL to recover its title.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The page embeds the Flash player URL either as a <param> or a JS var.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to get the canonical player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One episode is split into several <item> acts; download each act.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for every available rendition.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3247
3248
3249 class EscapistIE(InfoExtractor):
3250 """Information extractor for The Escapist """
3251
3252 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3253 IE_NAME = u'escapist'
3254
3255 def report_extraction(self, showName):
3256 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3257
3258 def report_config_download(self, showName):
3259 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3260
3261 def _real_extract(self, url):
3262 htmlParser = HTMLParser.HTMLParser()
3263
3264 mobj = re.match(self._VALID_URL, url)
3265 if mobj is None:
3266 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3267 return
3268 showName = mobj.group('showname')
3269 videoId = mobj.group('episode')
3270
3271 self.report_extraction(showName)
3272 try:
3273 webPage = urllib2.urlopen(url).read()
3274 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3275 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3276 return
3277
3278 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3279 description = htmlParser.unescape(descMatch.group(1))
3280 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3281 imgUrl = htmlParser.unescape(imgMatch.group(1))
3282 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3283 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3284 configUrlMatch = re.search('config=(.*)$', playerUrl)
3285 configUrl = urllib2.unquote(configUrlMatch.group(1))
3286
3287 self.report_config_download(showName)
3288 try:
3289 configJSON = urllib2.urlopen(configUrl).read()
3290 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3291 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3292 return
3293
3294 # Technically, it's JavaScript, not JSON
3295 configJSON = configJSON.replace("'", '"')
3296
3297 try:
3298 config = json.loads(configJSON)
3299 except (ValueError,), err:
3300 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3301 return
3302
3303 playlist = config['playlist']
3304 videoUrl = playlist[1]['url']
3305
3306 self._downloader.increment_downloads()
3307 info = {
3308 'id': videoId,
3309 'url': videoUrl,
3310 'uploader': showName,
3311 'upload_date': None,
3312 'title': showName,
3313 'stitle': _simplify_title(showName),
3314 'ext': 'flv',
3315 'format': 'flv',
3316 'thumbnail': imgUrl,
3317 'description': description,
3318 'player_url': playerUrl,
3319 }
3320
3321 try:
3322 self._downloader.process_info(info)
3323 except UnavailableVideoError, err:
3324 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3325
3326
3327 class CollegeHumorIE(InfoExtractor):
3328 """Information extractor for collegehumor.com"""
3329
3330 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3331 IE_NAME = u'collegehumor'
3332
3333 def report_webpage(self, video_id):
3334 """Report information extraction."""
3335 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3336
3337 def report_extraction(self, video_id):
3338 """Report information extraction."""
3339 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3340
3341 def _real_extract(self, url):
3342 htmlParser = HTMLParser.HTMLParser()
3343
3344 mobj = re.match(self._VALID_URL, url)
3345 if mobj is None:
3346 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3347 return
3348 video_id = mobj.group('videoid')
3349
3350 self.report_webpage(video_id)
3351 request = urllib2.Request(url)
3352 try:
3353 webpage = urllib2.urlopen(request).read()
3354 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3355 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3356 return
3357
3358 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3359 if m is None:
3360 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3361 return
3362 internal_video_id = m.group('internalvideoid')
3363
3364 info = {
3365 'id': video_id,
3366 'internal_id': internal_video_id,
3367 }
3368
3369 self.report_extraction(video_id)
3370 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3371 try:
3372 metaXml = urllib2.urlopen(xmlUrl).read()
3373 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3374 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3375 return
3376
3377 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3378 try:
3379 videoNode = mdoc.findall('./video')[0]
3380 info['description'] = videoNode.findall('./description')[0].text
3381 info['title'] = videoNode.findall('./caption')[0].text
3382 info['stitle'] = _simplify_title(info['title'])
3383 info['url'] = videoNode.findall('./file')[0].text
3384 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3385 info['ext'] = info['url'].rpartition('.')[2]
3386 info['format'] = info['ext']
3387 except IndexError:
3388 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3389 return
3390
3391 self._downloader.increment_downloads()
3392
3393 try:
3394 self._downloader.process_info(info)
3395 except UnavailableVideoError, err:
3396 self._downloader.trouble(u'\nERROR: unable to download video')
3397
3398
3399 class XVideosIE(InfoExtractor):
3400 """Information extractor for xvideos.com"""
3401
3402 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3403 IE_NAME = u'xvideos'
3404
3405 def report_webpage(self, video_id):
3406 """Report information extraction."""
3407 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3408
3409 def report_extraction(self, video_id):
3410 """Report information extraction."""
3411 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3412
3413 def _real_extract(self, url):
3414 htmlParser = HTMLParser.HTMLParser()
3415
3416 mobj = re.match(self._VALID_URL, url)
3417 if mobj is None:
3418 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3419 return
3420 video_id = mobj.group(1).decode('utf-8')
3421
3422 self.report_webpage(video_id)
3423
3424 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3425 try:
3426 webpage = urllib2.urlopen(request).read()
3427 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3428 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3429 return
3430
3431 self.report_extraction(video_id)
3432
3433
3434 # Extract video URL
3435 mobj = re.search(r'flv_url=(.+?)&', webpage)
3436 if mobj is None:
3437 self._downloader.trouble(u'ERROR: unable to extract video url')
3438 return
3439 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3440
3441
3442 # Extract title
3443 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3444 if mobj is None:
3445 self._downloader.trouble(u'ERROR: unable to extract video title')
3446 return
3447 video_title = mobj.group(1).decode('utf-8')
3448
3449
3450 # Extract video thumbnail
3451 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3452 if mobj is None:
3453 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3454 return
3455 video_thumbnail = mobj.group(1).decode('utf-8')
3456
3457
3458
3459 self._downloader.increment_downloads()
3460 info = {
3461 'id': video_id,
3462 'url': video_url,
3463 'uploader': None,
3464 'upload_date': None,
3465 'title': video_title,
3466 'stitle': _simplify_title(video_title),
3467 'ext': 'flv',
3468 'format': 'flv',
3469 'thumbnail': video_thumbnail,
3470 'description': None,
3471 'player_url': None,
3472 }
3473
3474 try:
3475 self._downloader.process_info(info)
3476 except UnavailableVideoError, err:
3477 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3478
3479
class SoundcloudIE(InfoExtractor):
	"""Information extractor for soundcloud.com
	   To access the media, the uid of the song and a stream token
	   must be extracted from the page source and the script must make
	   a request to media.soundcloud.com/crossdomain.xml. Then
	   the media can be grabbed by requesting from an url composed
	   of the stream token and uid
	 """

	_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'soundcloud'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# extract uploader (which is in the url)
		uploader = mobj.group(1).decode('utf-8')
		# extract simple title (uploader + slug of song title)
		slug_title =  mobj.group(2).decode('utf-8')
		simple_title = uploader + '-' + slug_title

		self.report_webpage('%s/%s' % (uploader, slug_title))

		request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
		try:
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		self.report_extraction('%s/%s' % (uploader, slug_title))

		# extract uid and stream token that soundcloud hands out for access
		# NOTE(review): if this regex fails, video_id and stream_token are
		# never bound and the process_info call below raises NameError —
		# confirm and consider an explicit trouble()+return here.
		mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
		if mobj:
			video_id = mobj.group(1)
			stream_token = mobj.group(2)

		# extract unsimplified title
		# NOTE(review): 'title' is extracted but never used below; the
		# simplified slug title is what gets reported as the title.
		mobj = re.search('"title":"(.*?)",', webpage)
		if mobj:
			title = mobj.group(1)

		# construct media url (with uid/token)
		mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
		mediaURL = mediaURL % (video_id, stream_token)

		# description
		description = u'No description available'
		mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
		if mobj:
			description = mobj.group(1)

		# upload date
		upload_date = None
		mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
		if mobj:
			try:
				upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
			except Exception, e:
				# NOTE(review): prints to stdout instead of going through the
				# downloader's reporting machinery.
				print str(e)

		# for soundcloud, a request to a cross domain is required for cookies
		request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

		try:
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		mediaURL,
				'uploader':	uploader.decode('utf-8'),
				'upload_date':  upload_date,
				'title':	simple_title.decode('utf-8'),
				'stitle':	simple_title.decode('utf-8'),
				'ext':		u'mp3',
				'format':	u'NA',
				'player_url':	None,
				'description': description.decode('utf-8')
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
3576
3577
3578 class InfoQIE(InfoExtractor):
3579 """Information extractor for infoq.com"""
3580
3581 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3582 IE_NAME = u'infoq'
3583
3584 def report_webpage(self, video_id):
3585 """Report information extraction."""
3586 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3587
3588 def report_extraction(self, video_id):
3589 """Report information extraction."""
3590 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3591
3592 def _real_extract(self, url):
3593 htmlParser = HTMLParser.HTMLParser()
3594
3595 mobj = re.match(self._VALID_URL, url)
3596 if mobj is None:
3597 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3598 return
3599
3600 self.report_webpage(url)
3601
3602 request = urllib2.Request(url)
3603 try:
3604 webpage = urllib2.urlopen(request).read()
3605 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3606 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3607 return
3608
3609 self.report_extraction(url)
3610
3611
3612 # Extract video URL
3613 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3614 if mobj is None:
3615 self._downloader.trouble(u'ERROR: unable to extract video url')
3616 return
3617 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3618
3619
3620 # Extract title
3621 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3622 if mobj is None:
3623 self._downloader.trouble(u'ERROR: unable to extract video title')
3624 return
3625 video_title = mobj.group(1).decode('utf-8')
3626
3627 # Extract description
3628 video_description = u'No description available.'
3629 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3630 if mobj is not None:
3631 video_description = mobj.group(1).decode('utf-8')
3632
3633 video_filename = video_url.split('/')[-1]
3634 video_id, extension = video_filename.split('.')
3635
3636 self._downloader.increment_downloads()
3637 info = {
3638 'id': video_id,
3639 'url': video_url,
3640 'uploader': None,
3641 'upload_date': None,
3642 'title': video_title,
3643 'stitle': _simplify_title(video_title),
3644 'ext': extension,
3645 'format': extension, # Extension is always(?) mp4, but seems to be flv
3646 'thumbnail': None,
3647 'description': video_description,
3648 'player_url': None,
3649 }
3650
3651 try:
3652 self._downloader.process_info(info)
3653 except UnavailableVideoError, err:
3654 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3655
3656 class MixcloudIE(InfoExtractor):
3657 """Information extractor for www.mixcloud.com"""
3658 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3659 IE_NAME = u'mixcloud'
3660
3661 def __init__(self, downloader=None):
3662 InfoExtractor.__init__(self, downloader)
3663
3664 def report_download_json(self, file_id):
3665 """Report JSON download."""
3666 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3667
3668 def report_extraction(self, file_id):
3669 """Report information extraction."""
3670 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3671
3672 def get_urls(self, jsonData, fmt, bitrate='best'):
3673 """Get urls from 'audio_formats' section in json"""
3674 file_url = None
3675 try:
3676 bitrate_list = jsonData[fmt]
3677 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3678 bitrate = max(bitrate_list) # select highest
3679
3680 url_list = jsonData[fmt][bitrate]
3681 except TypeError: # we have no bitrate info.
3682 url_list = jsonData[fmt]
3683
3684 return url_list
3685
3686 def check_urls(self, url_list):
3687 """Returns 1st active url from list"""
3688 for url in url_list:
3689 try:
3690 urllib2.urlopen(url)
3691 return url
3692 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3693 url = None
3694
3695 return None
3696
3697 def _print_formats(self, formats):
3698 print 'Available formats:'
3699 for fmt in formats.keys():
3700 for b in formats[fmt]:
3701 try:
3702 ext = formats[fmt][b][0]
3703 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3704 except TypeError: # we have no bitrate info
3705 ext = formats[fmt][0]
3706 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3707 break
3708
3709 def _real_extract(self, url):
3710 mobj = re.match(self._VALID_URL, url)
3711 if mobj is None:
3712 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3713 return
3714 # extract uploader & filename from url
3715 uploader = mobj.group(1).decode('utf-8')
3716 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3717
3718 # construct API request
3719 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3720 # retrieve .json file with links to files
3721 request = urllib2.Request(file_url)
3722 try:
3723 self.report_download_json(file_url)
3724 jsonData = urllib2.urlopen(request).read()
3725 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3726 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3727 return
3728
3729 # parse JSON
3730 json_data = json.loads(jsonData)
3731 player_url = json_data['player_swf_url']
3732 formats = dict(json_data['audio_formats'])
3733
3734 req_format = self._downloader.params.get('format', None)
3735 bitrate = None
3736
3737 if self._downloader.params.get('listformats', None):
3738 self._print_formats(formats)
3739 return
3740
3741 if req_format is None or req_format == 'best':
3742 for format_param in formats.keys():
3743 url_list = self.get_urls(formats, format_param)
3744 # check urls
3745 file_url = self.check_urls(url_list)
3746 if file_url is not None:
3747 break # got it!
3748 else:
3749 if req_format not in formats.keys():
3750 self._downloader.trouble(u'ERROR: format is not available')
3751 return
3752
3753 url_list = self.get_urls(formats, req_format)
3754 file_url = self.check_urls(url_list)
3755 format_param = req_format
3756
3757 # We have audio
3758 self._downloader.increment_downloads()
3759 try:
3760 # Process file information
3761 self._downloader.process_info({
3762 'id': file_id.decode('utf-8'),
3763 'url': file_url.decode('utf-8'),
3764 'uploader': uploader.decode('utf-8'),
3765 'upload_date': u'NA',
3766 'title': json_data['name'],
3767 'stitle': _simplify_title(json_data['name']),
3768 'ext': file_url.split('.')[-1].decode('utf-8'),
3769 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3770 'thumbnail': json_data['thumbnail_url'],
3771 'description': json_data['description'],
3772 'player_url': player_url.decode('utf-8'),
3773 })
3774 except UnavailableVideoError, err:
3775 self._downloader.trouble(u'ERROR: unable to download file')
3776
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Three URL shapes are recognized via named groups:
	#   - VideoPage with course= and video=  -> a single video
	#   - CoursePage with course= only      -> a course playlist
	#   - the site root / HomePage          -> playlist of all courses
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on URL shape: single video, course playlist, or site root.

		Playlist branches emit 'reference' entries and recurse through
		self.extract() for each linked page.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Per-video metadata lives in an XML file next to the media files.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				# findall()[0] raises IndexError when the element is missing.
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Title/description are scraped best-effort; fall back to the id.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Deduplicate while preserving page order, then recurse per video.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Every course page linked from the root becomes a playlist entry.
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3895
3896 class MTVIE(InfoExtractor):
3897 """Information extractor for MTV.com"""
3898
3899 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3900 IE_NAME = u'mtv'
3901
3902 def report_webpage(self, video_id):
3903 """Report information extraction."""
3904 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3905
3906 def report_extraction(self, video_id):
3907 """Report information extraction."""
3908 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3909
3910 def _real_extract(self, url):
3911 mobj = re.match(self._VALID_URL, url)
3912 if mobj is None:
3913 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3914 return
3915 if not mobj.group('proto'):
3916 url = 'http://' + url
3917 video_id = mobj.group('videoid')
3918 self.report_webpage(video_id)
3919
3920 request = urllib2.Request(url)
3921 try:
3922 webpage = urllib2.urlopen(request).read()
3923 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3924 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3925 return
3926
3927 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3928 if mobj is None:
3929 self._downloader.trouble(u'ERROR: unable to extract song name')
3930 return
3931 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3932 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3933 if mobj is None:
3934 self._downloader.trouble(u'ERROR: unable to extract performer')
3935 return
3936 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3937 video_title = performer + ' - ' + song_name
3938
3939 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3940 if mobj is None:
3941 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3942 return
3943 mtvn_uri = mobj.group(1)
3944
3945 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3946 if mobj is None:
3947 self._downloader.trouble(u'ERROR: unable to extract content id')
3948 return
3949 content_id = mobj.group(1)
3950
3951 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3952 self.report_extraction(video_id)
3953 request = urllib2.Request(videogen_url)
3954 try:
3955 metadataXml = urllib2.urlopen(request).read()
3956 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3957 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3958 return
3959
3960 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3961 renditions = mdoc.findall('.//rendition')
3962
3963 # For now, always pick the highest quality.
3964 rendition = renditions[-1]
3965
3966 try:
3967 _,_,ext = rendition.attrib['type'].partition('/')
3968 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3969 video_url = rendition.find('./src').text
3970 except KeyError:
3971 self._downloader.trouble('Invalid rendition field.')
3972 return
3973
3974 self._downloader.increment_downloads()
3975 info = {
3976 'id': video_id,
3977 'url': video_url,
3978 'uploader': performer,
3979 'title': video_title,
3980 'stitle': _simplify_title(video_title),
3981 'ext': ext,
3982 'format': format,
3983 }
3984
3985 try:
3986 self._downloader.process_info(info)
3987 except UnavailableVideoError, err:
3988 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3989
3990
class PostProcessor(object):
	"""Base class for all post-processing steps.

	A PostProcessor is attached to a downloader through its
	add_post_processor() method. After each successful download the
	downloader walks its chain of PostProcessors, calling run() first
	with the initial info dictionary and then with whatever the previous
	processor returned.

	A return value of None (or reaching the end of the chain) stops the
	chain. Registration is mutual, mirroring how InfoExtractor objects
	are wired to the downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is an InfoExtractor-style dictionary
		with one extra field, "filepath", naming the downloaded file.

		Return None to stop the post-processing chain, or an information
		dictionary (possibly the received one with some fields changed)
		to pass to the next processor in the chain. May also raise
		PostProcessingError, which the calling downloader handles.
		"""
		return information # the base class is a no-op
4036
class AudioConversionError(Exception):
	"""Raised when ffmpeg/ffprobe fails to convert the audio stream.

	Derives from Exception rather than BaseException: BaseException is
	reserved for non-error exits (KeyboardInterrupt, SystemExit) and let
	this error slip past generic ``except Exception`` handlers.
	"""

	def __init__(self, message):
		# Initialize the base class so str(err) and err.args work too.
		Exception.__init__(self, message)
		# Keep the .message attribute that callers read for the error text.
		self.message = message
4040
class FFmpegExtractAudioPP(PostProcessor):
	"""Post-processor that extracts the audio track of a downloaded video
	into a standalone audio file using ffmpeg/ffprobe."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		# 'best' means: keep the source codec losslessly where possible.
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at *path* via ffprobe,
		or None if ffprobe is missing, fails, or reports no audio stream."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		# ffprobe prints codec_name= before codec_type= within each stream
		# block, so remember the last codec_name seen and return it when the
		# matching stream turns out to be an audio stream.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to convert *path* to *out_path*.

		*codec* of None lets ffmpeg pick the encoder; otherwise it is passed
		via -acodec. Raises AudioConversionError on a missing ffmpeg binary
		or a non-zero exit status (with ffmpeg's last stderr line).
		"""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		# -vn drops the video stream; '--' guards against filenames that
		# start with a dash.
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			# errno 2 == ENOENT: the ffmpeg executable was not found.
			if isinstance(e, OSError) and e.errno == 2:
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		"""Convert the downloaded file to the preferred audio format.

		Returns None on failure (stops the PP chain) or *information*
		with 'filepath' pointing at the new audio file.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		# First branch: the source codec already matches the request (or
		# 'best' was asked), so prefer a lossless stream copy when we can.
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4165
4166
4167 def updateSelf(downloader, filename):
4168 ''' Update the program file with the latest version from the repository '''
4169 # Note: downloader only used for options
4170 if not os.access(filename, os.W_OK):
4171 sys.exit('ERROR: no write permissions on %s' % filename)
4172
4173 downloader.to_screen(u'Updating to latest version...')
4174
4175 try:
4176 try:
4177 urlh = urllib.urlopen(UPDATE_URL)
4178 newcontent = urlh.read()
4179
4180 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4181 if vmatch is not None and vmatch.group(1) == __version__:
4182 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4183 return
4184 finally:
4185 urlh.close()
4186 except (IOError, OSError), err:
4187 sys.exit('ERROR: unable to download latest version')
4188
4189 try:
4190 outf = open(filename, 'wb')
4191 try:
4192 outf.write(newcontent)
4193 finally:
4194 outf.close()
4195 except (IOError, OSError), err:
4196 sys.exit('ERROR: unable to overwrite current version')
4197
4198 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4199
4200 def parseOpts():
4201 # Deferred imports
4202 import getpass
4203 import optparse
4204 import shlex
4205
4206 def _readOptions(filename_bytes):
4207 try:
4208 optionf = open(filename_bytes)
4209 except IOError:
4210 return [] # silently skip if file is not present
4211 try:
4212 res = []
4213 for l in optionf:
4214 res += shlex.split(l, comments=True)
4215 finally:
4216 optionf.close()
4217 return res
4218
4219 def _format_option_string(option):
4220 ''' ('-o', '--option') -> -o, --format METAVAR'''
4221
4222 opts = []
4223
4224 if option._short_opts: opts.append(option._short_opts[0])
4225 if option._long_opts: opts.append(option._long_opts[0])
4226 if len(opts) > 1: opts.insert(1, ', ')
4227
4228 if option.takes_value(): opts.append(' %s' % option.metavar)
4229
4230 return "".join(opts)
4231
4232 def _find_term_columns():
4233 columns = os.environ.get('COLUMNS', None)
4234 if columns:
4235 return int(columns)
4236
4237 try:
4238 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4239 out,err = sp.communicate()
4240 return int(out.split()[1])
4241 except:
4242 pass
4243 return None
4244
4245 max_width = 80
4246 max_help_position = 80
4247
4248 # No need to wrap help messages if we're on a wide console
4249 columns = _find_term_columns()
4250 if columns: max_width = columns
4251
4252 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4253 fmt.format_option_strings = _format_option_string
4254
4255 kw = {
4256 'version' : __version__,
4257 'formatter' : fmt,
4258 'usage' : '%prog [options] url [url...]',
4259 'conflict_handler' : 'resolve',
4260 }
4261
4262 parser = optparse.OptionParser(**kw)
4263
4264 # option groups
4265 general = optparse.OptionGroup(parser, 'General Options')
4266 selection = optparse.OptionGroup(parser, 'Video Selection')
4267 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4268 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4269 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4270 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4271 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4272
4273 general.add_option('-h', '--help',
4274 action='help', help='print this help text and exit')
4275 general.add_option('-v', '--version',
4276 action='version', help='print program version and exit')
4277 general.add_option('-U', '--update',
4278 action='store_true', dest='update_self', help='update this program to latest version')
4279 general.add_option('-i', '--ignore-errors',
4280 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4281 general.add_option('-r', '--rate-limit',
4282 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4283 general.add_option('-R', '--retries',
4284 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4285 general.add_option('--dump-user-agent',
4286 action='store_true', dest='dump_user_agent',
4287 help='display the current browser identification', default=False)
4288 general.add_option('--list-extractors',
4289 action='store_true', dest='list_extractors',
4290 help='List all supported extractors and the URLs they would handle', default=False)
4291
4292 selection.add_option('--playlist-start',
4293 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4294 selection.add_option('--playlist-end',
4295 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4296 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4297 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4298 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4299
4300 authentication.add_option('-u', '--username',
4301 dest='username', metavar='USERNAME', help='account username')
4302 authentication.add_option('-p', '--password',
4303 dest='password', metavar='PASSWORD', help='account password')
4304 authentication.add_option('-n', '--netrc',
4305 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4306
4307
4308 video_format.add_option('-f', '--format',
4309 action='store', dest='format', metavar='FORMAT', help='video format code')
4310 video_format.add_option('--all-formats',
4311 action='store_const', dest='format', help='download all available video formats', const='all')
4312 video_format.add_option('--prefer-free-formats',
4313 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4314 video_format.add_option('--max-quality',
4315 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4316 video_format.add_option('-F', '--list-formats',
4317 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4318
4319
4320 verbosity.add_option('-q', '--quiet',
4321 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4322 verbosity.add_option('-s', '--simulate',
4323 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4324 verbosity.add_option('--skip-download',
4325 action='store_true', dest='skip_download', help='do not download the video', default=False)
4326 verbosity.add_option('-g', '--get-url',
4327 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4328 verbosity.add_option('-e', '--get-title',
4329 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4330 verbosity.add_option('--get-thumbnail',
4331 action='store_true', dest='getthumbnail',
4332 help='simulate, quiet but print thumbnail URL', default=False)
4333 verbosity.add_option('--get-description',
4334 action='store_true', dest='getdescription',
4335 help='simulate, quiet but print video description', default=False)
4336 verbosity.add_option('--get-filename',
4337 action='store_true', dest='getfilename',
4338 help='simulate, quiet but print output filename', default=False)
4339 verbosity.add_option('--get-format',
4340 action='store_true', dest='getformat',
4341 help='simulate, quiet but print output format', default=False)
4342 verbosity.add_option('--no-progress',
4343 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4344 verbosity.add_option('--console-title',
4345 action='store_true', dest='consoletitle',
4346 help='display progress in console titlebar', default=False)
4347
4348
4349 filesystem.add_option('-t', '--title',
4350 action='store_true', dest='usetitle', help='use title in file name', default=False)
4351 filesystem.add_option('-l', '--literal',
4352 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4353 filesystem.add_option('-A', '--auto-number',
4354 action='store_true', dest='autonumber',
4355 help='number downloaded files starting from 00000', default=False)
4356 filesystem.add_option('-o', '--output',
4357 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4358 filesystem.add_option('-a', '--batch-file',
4359 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4360 filesystem.add_option('-w', '--no-overwrites',
4361 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4362 filesystem.add_option('-c', '--continue',
4363 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
4364 filesystem.add_option('--no-continue',
4365 action='store_false', dest='continue_dl',
4366 help='do not resume partially downloaded files (restart from beginning)')
4367 filesystem.add_option('--cookies',
4368 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4369 filesystem.add_option('--no-part',
4370 action='store_true', dest='nopart', help='do not use .part files', default=False)
4371 filesystem.add_option('--no-mtime',
4372 action='store_false', dest='updatetime',
4373 help='do not use the Last-modified header to set the file modification time', default=True)
4374 filesystem.add_option('--write-description',
4375 action='store_true', dest='writedescription',
4376 help='write video description to a .description file', default=False)
4377 filesystem.add_option('--write-info-json',
4378 action='store_true', dest='writeinfojson',
4379 help='write video metadata to a .info.json file', default=False)
4380
4381
4382 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4383 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4384 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4385 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4386 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4387 help='ffmpeg audio bitrate specification, 128k by default')
4388 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4389 help='keeps the video file on disk after the post-processing; the video is erased by default')
4390
4391
4392 parser.add_option_group(general)
4393 parser.add_option_group(selection)
4394 parser.add_option_group(filesystem)
4395 parser.add_option_group(verbosity)
4396 parser.add_option_group(video_format)
4397 parser.add_option_group(authentication)
4398 parser.add_option_group(postproc)
4399
4400 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4401 if xdg_config_home:
4402 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4403 else:
4404 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4405 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4406 opts, args = parser.parse_args(argv)
4407
4408 return parser, opts, args
4409
def gen_extractors():
	"""Build and return the list of all supported information extractors.

	Ordering is significant: for each URL the first extractor whose
	suitable() check accepts it is the one used, so the catch-all
	GenericIE must come last.
	"""
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()

	# Site-specific extractors; several of them (playlists, searches)
	# delegate individual videos to a shared base extractor instance.
	ies = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),
	]

	# Fallback: tried only when nothing above matched.
	ies.append(GenericIE())
	return ies
4446
4447 def _real_main():
4448 parser, opts, args = parseOpts()
4449
4450 # Open appropriate CookieJar
4451 if opts.cookiefile is None:
4452 jar = cookielib.CookieJar()
4453 else:
4454 try:
4455 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4456 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4457 jar.load()
4458 except (IOError, OSError), err:
4459 sys.exit(u'ERROR: unable to open cookie file')
4460
4461 # Dump user agent
4462 if opts.dump_user_agent:
4463 print std_headers['User-Agent']
4464 sys.exit(0)
4465
4466 # Batch file verification
4467 batchurls = []
4468 if opts.batchfile is not None:
4469 try:
4470 if opts.batchfile == '-':
4471 batchfd = sys.stdin
4472 else:
4473 batchfd = open(opts.batchfile, 'r')
4474 batchurls = batchfd.readlines()
4475 batchurls = [x.strip() for x in batchurls]
4476 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4477 except IOError:
4478 sys.exit(u'ERROR: batch file could not be read')
4479 all_urls = batchurls + args
4480
4481 # General configuration
4482 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4483 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4484 urllib2.install_opener(opener)
4485 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4486
4487 extractors = gen_extractors()
4488
4489 if opts.list_extractors:
4490 for ie in extractors:
4491 print(ie.IE_NAME)
4492 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4493 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4494 for mu in matchedUrls:
4495 print(u' ' + mu)
4496 sys.exit(0)
4497
4498 # Conflicting, missing and erroneous options
4499 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4500 parser.error(u'using .netrc conflicts with giving username/password')
4501 if opts.password is not None and opts.username is None:
4502 parser.error(u'account username missing')
4503 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4504 parser.error(u'using output template conflicts with using title, literal title or auto number')
4505 if opts.usetitle and opts.useliteral:
4506 parser.error(u'using title conflicts with using literal title')
4507 if opts.username is not None and opts.password is None:
4508 opts.password = getpass.getpass(u'Type account password and press return:')
4509 if opts.ratelimit is not None:
4510 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4511 if numeric_limit is None:
4512 parser.error(u'invalid rate limit specified')
4513 opts.ratelimit = numeric_limit
4514 if opts.retries is not None:
4515 try:
4516 opts.retries = long(opts.retries)
4517 except (TypeError, ValueError), err:
4518 parser.error(u'invalid retry count specified')
4519 try:
4520 opts.playliststart = int(opts.playliststart)
4521 if opts.playliststart <= 0:
4522 raise ValueError(u'Playlist start must be positive')
4523 except (TypeError, ValueError), err:
4524 parser.error(u'invalid playlist start number specified')
4525 try:
4526 opts.playlistend = int(opts.playlistend)
4527 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4528 raise ValueError(u'Playlist end must be greater than playlist start')
4529 except (TypeError, ValueError), err:
4530 parser.error(u'invalid playlist end number specified')
4531 if opts.extractaudio:
4532 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4533 parser.error(u'invalid audio format specified')
4534
4535 # File downloader
4536 fd = FileDownloader({
4537 'usenetrc': opts.usenetrc,
4538 'username': opts.username,
4539 'password': opts.password,
4540 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4541 'forceurl': opts.geturl,
4542 'forcetitle': opts.gettitle,
4543 'forcethumbnail': opts.getthumbnail,
4544 'forcedescription': opts.getdescription,
4545 'forcefilename': opts.getfilename,
4546 'forceformat': opts.getformat,
4547 'simulate': opts.simulate,
4548 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4549 'format': opts.format,
4550 'format_limit': opts.format_limit,
4551 'listformats': opts.listformats,
4552 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4553 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4554 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4555 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4556 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4557 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4558 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4559 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4560 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4561 or u'%(id)s.%(ext)s'),
4562 'ignoreerrors': opts.ignoreerrors,
4563 'ratelimit': opts.ratelimit,
4564 'nooverwrites': opts.nooverwrites,
4565 'retries': opts.retries,
4566 'continuedl': opts.continue_dl,
4567 'noprogress': opts.noprogress,
4568 'playliststart': opts.playliststart,
4569 'playlistend': opts.playlistend,
4570 'logtostderr': opts.outtmpl == '-',
4571 'consoletitle': opts.consoletitle,
4572 'nopart': opts.nopart,
4573 'updatetime': opts.updatetime,
4574 'writedescription': opts.writedescription,
4575 'writeinfojson': opts.writeinfojson,
4576 'matchtitle': opts.matchtitle,
4577 'rejecttitle': opts.rejecttitle,
4578 'max_downloads': opts.max_downloads,
4579 'prefer_free_formats': opts.prefer_free_formats,
4580 })
4581 for extractor in extractors:
4582 fd.add_info_extractor(extractor)
4583
4584 # PostProcessors
4585 if opts.extractaudio:
4586 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4587
4588 # Update version
4589 if opts.update_self:
4590 updateSelf(fd, sys.argv[0])
4591
4592 # Maybe do nothing
4593 if len(all_urls) < 1:
4594 if not opts.update_self:
4595 parser.error(u'you must provide at least one URL')
4596 else:
4597 sys.exit()
4598
4599 try:
4600 retcode = fd.download(all_urls)
4601 except MaxDownloadsReached:
4602 fd.to_screen(u'--max-download limit reached, aborting.')
4603 retcode = 101
4604
4605 # Dump cookie jar if requested
4606 if opts.cookiefile is not None:
4607 try:
4608 jar.save()
4609 except (IOError, OSError), err:
4610 sys.exit(u'ERROR: unable to save cookie jar')
4611
4612 sys.exit(retcode)
4613
def main():
	"""Program entry point.

	Runs _real_main() and translates the well-known failure modes into
	clean exit messages/codes instead of raw tracebacks.
	"""
	try:
		_real_main()
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except DownloadError:
		sys.exit(1)
4623
# Script entry point: only run when executed directly, not when imported.
if __name__ == '__main__':
	main()
4626
4627 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: