]> jfr.im git - yt-dlp.git/blob - youtube-dl
Clean up superfluous whitespace
[yt-dlp.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Everyone who has contributed code to this script.
__authors__ = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    'Kevin Ngo',
    'Ori Avtalion',
    'shizeeg',
    'Filippo Valsorda',
    )

__license__ = 'Public Domain'
__version__ = '2012.02.27'

# Upstream location of the latest script; presumably consumed by a
# self-update routine defined later in the file -- TODO confirm (not
# visible in this chunk).
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
25
26
27 import cookielib
28 import datetime
29 import getpass
30 import gzip
31 import htmlentitydefs
32 import HTMLParser
33 import httplib
34 import locale
35 import math
36 import netrc
37 import optparse
38 import os
39 import os.path
40 import re
41 import shlex
42 import socket
43 import string
44 import subprocess
45 import sys
46 import time
47 import urllib
48 import urllib2
49 import warnings
50 import zlib
51
52 if os.name == 'nt':
53 import ctypes
54
55 try:
56 import email.utils
57 except ImportError: # Python 2.4
58 import email.Utils
59 try:
60 import cStringIO as StringIO
61 except ImportError:
62 import StringIO
63
64 # parse_qs was moved from the cgi module to the urlparse module recently.
65 try:
66 from urlparse import parse_qs
67 except ImportError:
68 from cgi import parse_qs
69
70 try:
71 import lxml.etree
72 except ImportError:
73 pass # Handled below
74
75 try:
76 import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
79
# Default HTTP headers merged into every outgoing request by
# YoutubeDLHandler.http_request() below (existing request headers with the
# same name are replaced).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
87
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        """Minimal stand-in for the stdlib json module: decode only."""
        @staticmethod
        def loads(s):
            """Parse a UTF-8 byte string as JSON and return the value.

            Raises ValueError (with position information) on malformed input.
            """
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance i past JSON whitespace; optionally require that
                # some input remains afterwards.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Translate one backslash escape (captured without the
                # leading backslash) to its character.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        # \uXXXX\uXXXX surrogate pair -> single astral codepoint.
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                while True:
                    e = s.index('"', e)
                    # Count the backslashes immediately before the quote; an
                    # odd count means the quote is escaped, keep scanning.
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # Bare literals: true / false / null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                # Any fractional or exponent part makes it a float.
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first character of the value; anything else is
            # assumed to be a number.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
200
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Some platforms report an encoding the codecs machinery cannot
        # actually use; probe it before trusting it.
        u'TEST'.encode(pref)
    except:
        # Deliberately broad: any failure (locale error, unknown codec)
        # falls back to UTF-8, matching the original behavior.
        pref = 'UTF-8'
    return pref
216
217
218 def htmlentity_transform(matchobj):
219 """Transforms an HTML entity to a Unicode character.
220
221 This function receives a match object and is intended to be used with
222 the re.sub() function.
223 """
224 entity = matchobj.group(1)
225
226 # Known non-numeric HTML entity
227 if entity in htmlentitydefs.name2codepoint:
228 return unichr(htmlentitydefs.name2codepoint[entity])
229
230 # Unicode character
231 mobj = re.match(ur'(?u)#(x?\d+)', entity)
232 if mobj is not None:
233 numstr = mobj.group(1)
234 if numstr.startswith(u'x'):
235 base = 16
236 numstr = u'0%s' % numstr
237 else:
238 base = 10
239 return unichr(long(numstr, base))
240
241 # Unknown entity in name, return its literal representation
242 return (u'&%s;' % entity)
243
244
245 def sanitize_title(utitle):
246 """Sanitizes a video title so it could be used as part of a filename."""
247 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
248 return utitle.replace(unicode(os.sep), u'%')
249
250
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        # '-' means "write to standard output".
        if filename == u'-':
            if sys.platform == 'win32':
                # Put stdout into binary mode so byte output is not mangled.
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
276
277
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
285
286 def _simplify_title(title):
287 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
288 return expr.sub(u'_', title).strip(u'_')
289
290 def _orderedSet(iterable):
291 """ Remove all duplicates from the input iterable """
292 res = []
293 for el in iterable:
294 if el not in res:
295 res.append(el)
296 return res
297
def _unescapeHTML(s):
    """Replace HTML entities in s with the characters they denote.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')
    return HTMLParser.HTMLParser().unescape(s)
306
307 def _encodeFilename(s):
308 """
309 @param s The name of the file (of type unicode)
310 """
311
312 assert type(s) == type(u'')
313
314 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
315 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
316 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
317 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
318 return s
319 else:
320 return s.encode(sys.getfilesystemencoding(), 'ignore')
321
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
330
331
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
339
340
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
348
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
352
353
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
361
362
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
377
378
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try raw deflate first (negative wbits = no zlib header); some
        # servers send that despite advertising 'deflate'. Fall back to the
        # standard zlib stream.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Build an addinfourl carrying the response code; older Pythons'
        # addinfourl has no getcode()/code constructor support, so set the
        # attribute by hand there.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the std_headers values onto the request, replacing any
        # headers of the same name that were already set.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # Honor the internal no-compression marker header and strip it
        # before the request goes on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
436
437
438 class FileDownloader(object):
439 """File Downloader class.
440
441 File downloader objects are the ones responsible of downloading the
442 actual video file and writing it to disk if the user has requested
443 it, among some other tasks. In most cases there should be one per
444 program. As, given a video URL, the downloader doesn't know how to
445 extract all the needed information, task that InfoExtractors do, it
446 has to pass the URL to one of them.
447
448 For this, file downloader objects have a method that allows
449 InfoExtractors to be registered in a given order. When it is passed
450 a URL, the file downloader handles it to the first InfoExtractor it
451 finds that reports being able to handle it. The InfoExtractor extracts
452 all the information about the video or videos the URL refers to, and
453 asks the FileDownloader to process the video information, possibly
454 downloading the video.
455
456 File downloaders accept a lot of parameters. In order not to saturate
457 the object constructor with arguments, it receives a dictionary of
458 options instead. These options are available through the params
459 attribute for the InfoExtractors to use. The FileDownloader also
460 registers itself as the downloader in charge for the InfoExtractors
461 that are added to it, so this is a "mutual registration".
462
463 Available options:
464
465 username: Username for authentication purposes.
466 password: Password for authentication purposes.
467 usenetrc: Use netrc for authentication instead.
468 quiet: Do not print messages to stdout.
469 forceurl: Force printing final URL.
470 forcetitle: Force printing title.
471 forcethumbnail: Force printing thumbnail URL.
472 forcedescription: Force printing description.
473 forcefilename: Force printing final filename.
474 simulate: Do not download the video files.
475 format: Video format code.
476 format_limit: Highest quality format to try.
477 outtmpl: Template for output names.
478 ignoreerrors: Do not stop on download errors.
479 ratelimit: Download speed limit, in bytes/sec.
480 nooverwrites: Prevent overwriting files.
481 retries: Number of times to retry for HTTP error 5xx
482 continuedl: Try to continue downloads if possible.
483 noprogress: Do not print the progress bar.
484 playliststart: Playlist item to start at.
485 playlistend: Playlist item to end at.
486 matchtitle: Download only matching titles.
487 rejecttitle: Reject downloads for matching titles.
488 logtostderr: Log messages to stderr instead of stdout.
489 consoletitle: Display progress in console window's titlebar.
490 nopart: Do not use temporary .part files.
491 updatetime: Use the Last-modified header to set output file timestamps.
492 writedescription: Write the video description to a .description file
493 writeinfojson: Write the video description to a .info.json file
494 writesubtitles: Write the video subtitles to a .srt file
495 subtitleslang: Language of the subtitles to download
496 """
497
498 params = None
499 _ies = []
500 _pps = []
501 _download_retcode = None
502 _num_downloads = None
503 _screen_file = None
504
505 def __init__(self, params):
506 """Create a FileDownloader object with the given options."""
507 self._ies = []
508 self._pps = []
509 self._download_retcode = 0
510 self._num_downloads = 0
511 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
512 self.params = params
513
514 @staticmethod
515 def format_bytes(bytes):
516 if bytes is None:
517 return 'N/A'
518 if type(bytes) is str:
519 bytes = float(bytes)
520 if bytes == 0.0:
521 exponent = 0
522 else:
523 exponent = long(math.log(bytes, 1024.0))
524 suffix = 'bkMGTPEZY'[exponent]
525 converted = float(bytes) / float(1024 ** exponent)
526 return '%.2f%s' % (converted, suffix)
527
528 @staticmethod
529 def calc_percent(byte_counter, data_len):
530 if data_len is None:
531 return '---.-%'
532 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
533
534 @staticmethod
535 def calc_eta(start, now, total, current):
536 if total is None:
537 return '--:--'
538 dif = now - start
539 if current == 0 or dif < 0.001: # One millisecond
540 return '--:--'
541 rate = float(current) / dif
542 eta = long((float(total) - float(current)) / rate)
543 (eta_mins, eta_secs) = divmod(eta, 60)
544 if eta_mins > 99:
545 return '--:--'
546 return '%02d:%02d' % (eta_mins, eta_secs)
547
548 @staticmethod
549 def calc_speed(start, now, bytes):
550 dif = now - start
551 if bytes == 0 or dif < 0.001: # One millisecond
552 return '%10s' % '---b/s'
553 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
554
555 @staticmethod
556 def best_block_size(elapsed_time, bytes):
557 new_min = max(bytes / 2.0, 1.0)
558 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
559 if elapsed_time < 0.001:
560 return long(new_max)
561 rate = bytes / elapsed_time
562 if rate > new_max:
563 return long(new_max)
564 if rate < new_min:
565 return long(new_min)
566 return long(rate)
567
568 @staticmethod
569 def parse_bytes(bytestr):
570 """Parse a string indicating a byte quantity into a long integer."""
571 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
572 if matchobj is None:
573 return None
574 number = float(matchobj.group(1))
575 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
576 return long(round(number * multiplier))
577
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        # Mutual registration: the IE also gets a reference back to us.
        ie.set_downloader(self)
582
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        # Mutual registration: the PP also gets a reference back to us.
        pp.set_downloader(self)
587
    def to_screen(self, message, skip_eol=False):
        """Print message (a unicode string) to the screen file (stdout or
        stderr, see __init__) unless in quiet mode; skip_eol suppresses the
        trailing newline (used by the in-place progress display)."""
        assert type(message) == type(u'')
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            output = message + terminator

            if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                output = output.encode(preferredencoding(), 'ignore')
            self._screen_file.write(output)
            # Flush so progress updates appear immediately.
            self._screen_file.flush()
599
    def to_stderr(self, message):
        """Print message to stderr, encoded with the locale's preferred
        encoding."""
        print >>sys.stderr, message.encode(preferredencoding())
603
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-style escape sequence: ESC ] 0 ; <title> BEL
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
614
615 def fixed_template(self):
616 """Checks if the output template is fixed."""
617 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
618
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # When errors are ignored, remember the failure in the exit code.
        self._download_retcode = 1
631
632 def slow_down(self, start_time, byte_counter):
633 """Sleep if the download speed is over the rate limit."""
634 rate_limit = self.params.get('ratelimit', None)
635 if rate_limit is None or byte_counter == 0:
636 return
637 now = time.time()
638 elapsed = now - start_time
639 if elapsed <= 0.0:
640 return
641 speed = float(byte_counter) / elapsed
642 if speed > rate_limit:
643 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
644
645 def temp_name(self, filename):
646 """Returns a temporary filename for the given filename."""
647 if self.params.get('nopart', False) or filename == u'-' or \
648 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
649 return filename
650 return filename + u'.part'
651
652 def undo_temp_name(self, filename):
653 if filename.endswith(u'.part'):
654 return filename[:-len(u'.part')]
655 return filename
656
    def try_rename(self, old_filename, new_filename):
        """Rename old_filename to new_filename, reporting (but not raising)
        failures through self.trouble()."""
        try:
            if old_filename == new_filename:
                return
            os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
        except (IOError, OSError), err:
            # NOTE(review): err is intentionally not included in the message.
            self.trouble(u'ERROR: unable to rename file')
664
665 def try_utime(self, filename, last_modified_hdr):
666 """Try to set the last-modified time of the given file."""
667 if last_modified_hdr is None:
668 return
669 if not os.path.isfile(_encodeFilename(filename)):
670 return
671 timestr = last_modified_hdr
672 if timestr is None:
673 return
674 filetime = timeconvert(timestr)
675 if filetime is None:
676 return filetime
677 try:
678 os.utime(filename, (time.time(), filetime))
679 except:
680 pass
681 return filetime
682
    def report_writedescription(self, descfn):
        """ Report that the description file is being written """
        self.to_screen(u'[info] Writing video description to: ' + descfn)
686
    def report_writesubtitles(self, srtfn):
        """ Report that the subtitles file is being written """
        self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
690
691 def report_writeinfojson(self, infofn):
692 """ Report that the metadata file has been written """
693 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
694
    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: ' + filename)
698
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        # '\r' + skip_eol redraws the progress line in place.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        # Mirror the progress into the terminal/console title bar.
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
707
    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
711
    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
715
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # File name not representable in the output encoding; report
            # without it.
            self.to_screen(u'[download] The file has already been downloaded')
722
    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')
726
    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # The progress line was drawn with skip_eol; emit the newline
            # that terminates it.
            self.to_screen(u'')
733
    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1
737
    def prepare_filename(self, info_dict):
        """Generate the output filename from the 'outtmpl' template, or
        return None (after reporting) when the template cannot be filled."""
        try:
            template_dict = dict(info_dict)
            # Extra template fields beyond what the IE provided.
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None
749
    def _match_entry(self, info_dict):
        """ Returns None iff the file should be downloaded """

        title = info_dict['title']
        matchtitle = self.params.get('matchtitle', False)
        if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
            # NOTE(review): this message already carries '[download] ' while
            # the caller prepends it again -- the prefix appears twice.
            return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
        rejecttitle = self.params.get('rejecttitle', False)
        if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
            return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
        return None
761
762 def process_info(self, info_dict):
763 """Process a single dictionary returned by an InfoExtractor."""
764
765 reason = self._match_entry(info_dict)
766 if reason is not None:
767 self.to_screen(u'[download] ' + reason)
768 return
769
770 max_downloads = self.params.get('max_downloads')
771 if max_downloads is not None:
772 if self._num_downloads > int(max_downloads):
773 raise MaxDownloadsReached()
774
775 filename = self.prepare_filename(info_dict)
776
777 # Forced printings
778 if self.params.get('forcetitle', False):
779 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
780 if self.params.get('forceurl', False):
781 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
782 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
783 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
784 if self.params.get('forcedescription', False) and 'description' in info_dict:
785 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
786 if self.params.get('forcefilename', False) and filename is not None:
787 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
788 if self.params.get('forceformat', False):
789 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
790
791 # Do nothing else if in simulate mode
792 if self.params.get('simulate', False):
793 return
794
795 if filename is None:
796 return
797
798 try:
799 dn = os.path.dirname(_encodeFilename(filename))
800 if dn != '' and not os.path.exists(dn): # dn is already encoded
801 os.makedirs(dn)
802 except (OSError, IOError), err:
803 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
804 return
805
806 if self.params.get('writedescription', False):
807 try:
808 descfn = filename + u'.description'
809 self.report_writedescription(descfn)
810 descfile = open(_encodeFilename(descfn), 'wb')
811 try:
812 descfile.write(info_dict['description'].encode('utf-8'))
813 finally:
814 descfile.close()
815 except (OSError, IOError):
816 self.trouble(u'ERROR: Cannot write description file ' + descfn)
817 return
818
819 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
820 # subtitles download errors are already managed as troubles in relevant IE
821 # that way it will silently go on when used with unsupporting IE
822 try:
823 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
824 self.report_writesubtitles(srtfn)
825 srtfile = open(_encodeFilename(srtfn), 'wb')
826 try:
827 srtfile.write(info_dict['subtitles'].encode('utf-8'))
828 finally:
829 srtfile.close()
830 except (OSError, IOError):
831 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
832 return
833
834 if self.params.get('writeinfojson', False):
835 infofn = filename + u'.info.json'
836 self.report_writeinfojson(infofn)
837 try:
838 json.dump
839 except (NameError,AttributeError):
840 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
841 return
842 try:
843 infof = open(_encodeFilename(infofn), 'wb')
844 try:
845 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
846 json.dump(json_info_dict, infof)
847 finally:
848 infof.close()
849 except (OSError, IOError):
850 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
851 return
852
853 if not self.params.get('skip_download', False):
854 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
855 success = True
856 else:
857 try:
858 success = self._do_download(filename, info_dict)
859 except (OSError, IOError), err:
860 raise UnavailableVideoError
861 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
862 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
863 return
864 except (ContentTooShortError, ), err:
865 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
866 return
867
868 if success:
869 try:
870 self.post_process(filename, info_dict)
871 except (PostProcessingError), err:
872 self.trouble(u'ERROR: postprocessing: %s' % str(err))
873 return
874
875 def download(self, url_list):
876 """Download a given list of URLs."""
877 if len(url_list) > 1 and self.fixed_template():
878 raise SameFileError(self.params['outtmpl'])
879
880 for url in url_list:
881 suitable_found = False
882 for ie in self._ies:
883 # Go to next InfoExtractor if not suitable
884 if not ie.suitable(url):
885 continue
886
887 # Suitable InfoExtractor found
888 suitable_found = True
889
890 # Extract information from URL and process it
891 ie.extract(url)
892
893 # Suitable InfoExtractor had been found; go to next URL
894 break
895
896 if not suitable_found:
897 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
898
899 return self._download_retcode
900
    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                # A postprocessor may abort the rest of the chain by
                # returning None.
                break
909
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by shelling out to rtmpdump, resuming
        until it either finishes or stops making progress.

        Returns True on success, False on failure (after reporting).
        """
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
        if self.params.get('verbose', False):
            try:
                # pipes.quote gives proper shell quoting when available.
                import pipes
                shell_quote = lambda args: ' '.join(map(pipes.quote, args))
            except ImportError:
                shell_quote = repr
            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
        retval = subprocess.call(args)
        # Keep re-invoking rtmpdump in resume mode while it reports a
        # resumable interruption (2) or an error (1), as long as the file
        # keeps growing between attempts.
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(_encodeFilename(tmpfilename))
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(_encodeFilename(tmpfilename))
            if prevsize == cursize and retval == 1:
                break
             # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
954
955 def _do_download(self, filename, info_dict):
956 url = info_dict['url']
957 player_url = info_dict.get('player_url', None)
958
959 # Check file already present
960 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
961 self.report_file_already_downloaded(filename)
962 return True
963
964 # Attempt to download using rtmpdump
965 if url.startswith('rtmp'):
966 return self._download_with_rtmpdump(filename, url, player_url)
967
968 tmpfilename = self.temp_name(filename)
969 stream = None
970
971 # Do not include the Accept-Encoding header
972 headers = {'Youtubedl-no-compression': 'True'}
973 basic_request = urllib2.Request(url, None, headers)
974 request = urllib2.Request(url, None, headers)
975
976 # Establish possible resume length
977 if os.path.isfile(_encodeFilename(tmpfilename)):
978 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
979 else:
980 resume_len = 0
981
982 open_mode = 'wb'
983 if resume_len != 0:
984 if self.params.get('continuedl', False):
985 self.report_resuming_byte(resume_len)
986 request.add_header('Range','bytes=%d-' % resume_len)
987 open_mode = 'ab'
988 else:
989 resume_len = 0
990
991 count = 0
992 retries = self.params.get('retries', 0)
993 while count <= retries:
994 # Establish connection
995 try:
996 if count == 0 and 'urlhandle' in info_dict:
997 data = info_dict['urlhandle']
998 data = urllib2.urlopen(request)
999 break
1000 except (urllib2.HTTPError, ), err:
1001 if (err.code < 500 or err.code >= 600) and err.code != 416:
1002 # Unexpected HTTP error
1003 raise
1004 elif err.code == 416:
1005 # Unable to resume (requested range not satisfiable)
1006 try:
1007 # Open the connection again without the range header
1008 data = urllib2.urlopen(basic_request)
1009 content_length = data.info()['Content-Length']
1010 except (urllib2.HTTPError, ), err:
1011 if err.code < 500 or err.code >= 600:
1012 raise
1013 else:
1014 # Examine the reported length
1015 if (content_length is not None and
1016 (resume_len - 100 < long(content_length) < resume_len + 100)):
1017 # The file had already been fully downloaded.
1018 # Explanation to the above condition: in issue #175 it was revealed that
1019 # YouTube sometimes adds or removes a few bytes from the end of the file,
1020 # changing the file size slightly and causing problems for some users. So
1021 # I decided to implement a suggested change and consider the file
1022 # completely downloaded if the file size differs less than 100 bytes from
1023 # the one in the hard drive.
1024 self.report_file_already_downloaded(filename)
1025 self.try_rename(tmpfilename, filename)
1026 return True
1027 else:
1028 # The length does not match, we start the download over
1029 self.report_unable_to_resume()
1030 open_mode = 'wb'
1031 break
1032 # Retry
1033 count += 1
1034 if count <= retries:
1035 self.report_retry(count, retries)
1036
1037 if count > retries:
1038 self.trouble(u'ERROR: giving up after %s retries' % retries)
1039 return False
1040
1041 data_len = data.info().get('Content-length', None)
1042 if data_len is not None:
1043 data_len = long(data_len) + resume_len
1044 data_len_str = self.format_bytes(data_len)
1045 byte_counter = 0 + resume_len
1046 block_size = 1024
1047 start = time.time()
1048 while True:
1049 # Download and write
1050 before = time.time()
1051 data_block = data.read(block_size)
1052 after = time.time()
1053 if len(data_block) == 0:
1054 break
1055 byte_counter += len(data_block)
1056
1057 # Open file just in time
1058 if stream is None:
1059 try:
1060 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1061 assert stream is not None
1062 filename = self.undo_temp_name(tmpfilename)
1063 self.report_destination(filename)
1064 except (OSError, IOError), err:
1065 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1066 return False
1067 try:
1068 stream.write(data_block)
1069 except (IOError, OSError), err:
1070 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1071 return False
1072 block_size = self.best_block_size(after - before, len(data_block))
1073
1074 # Progress message
1075 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1076 if data_len is None:
1077 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1078 else:
1079 percent_str = self.calc_percent(byte_counter, data_len)
1080 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1081 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1082
1083 # Apply rate limit
1084 self.slow_down(start, byte_counter - resume_len)
1085
1086 if stream is None:
1087 self.trouble(u'\nERROR: Did not get any data blocks')
1088 return False
1089 stream.close()
1090 self.report_finish()
1091 if data_len is not None and byte_counter != data_len:
1092 raise ContentTooShortError(byte_counter, long(data_len))
1093 self.try_rename(tmpfilename, filename)
1094
1095 # Update file modification time
1096 if self.params.get('updatetime', True):
1097 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1098
1099 return True
1100
1101
class InfoExtractor(object):
	"""Base class for all information extractors.

	Given a URL, an information extractor produces one dictionary per video
	and hands it to the FileDownloader, which takes care of the actual
	download (and any postprocessing). Each dictionary must contain:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	Optional fields, used mainly when youtube-dl serves as the backend of a
	video search front-end (e.g. youtube2mp3) and the corresponding forced
	printing option is active:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses must define a _VALID_URL regexp and override
	_real_initialize() and _real_extract(); they should normally also be
	registered in the global extractor list.
	"""

	# Class-level defaults; __init__ re-sets them per instance.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor and attach the optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this extractor can handle the given URL."""
		match = re.match(self._VALID_URL, url)
		return match is not None

	def initialize(self):
		"""Run one-time setup (authentication, etc) on first use only."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract and return the video info."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1170
1171
1172 class YoutubeIE(InfoExtractor):
1173 """Information extractor for youtube.com."""
1174
1175 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1176 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1177 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1178 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1179 _NETRC_MACHINE = 'youtube'
1180 # Listed in order of quality
1181 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1182 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1183 _video_extensions = {
1184 '13': '3gp',
1185 '17': 'mp4',
1186 '18': 'mp4',
1187 '22': 'mp4',
1188 '37': 'mp4',
1189 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1190 '43': 'webm',
1191 '44': 'webm',
1192 '45': 'webm',
1193 }
1194 _video_dimensions = {
1195 '5': '240x400',
1196 '6': '???',
1197 '13': '???',
1198 '17': '144x176',
1199 '18': '360x640',
1200 '22': '720x1280',
1201 '34': '360x640',
1202 '35': '480x854',
1203 '37': '1080x1920',
1204 '38': '3072x4096',
1205 '43': '360x640',
1206 '44': '480x854',
1207 '45': '720x1280',
1208 }
1209 IE_NAME = u'youtube'
1210
1211 def report_lang(self):
1212 """Report attempt to set language."""
1213 self._downloader.to_screen(u'[youtube] Setting language')
1214
1215 def report_login(self):
1216 """Report attempt to log in."""
1217 self._downloader.to_screen(u'[youtube] Logging in')
1218
1219 def report_age_confirmation(self):
1220 """Report attempt to confirm age."""
1221 self._downloader.to_screen(u'[youtube] Confirming age')
1222
1223 def report_video_webpage_download(self, video_id):
1224 """Report attempt to download video webpage."""
1225 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1226
1227 def report_video_info_webpage_download(self, video_id):
1228 """Report attempt to download video info webpage."""
1229 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1230
1231 def report_video_subtitles_download(self, video_id):
1232 """Report attempt to download video info webpage."""
1233 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1234
1235 def report_information_extraction(self, video_id):
1236 """Report attempt to extract video information."""
1237 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1238
1239 def report_unavailable_format(self, video_id, format):
1240 """Report extracted video URL."""
1241 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1242
1243 def report_rtmp_download(self):
1244 """Indicate the download will use the RTMP protocol."""
1245 self._downloader.to_screen(u'[youtube] RTMP download detected')
1246
1247 def _closed_captions_xml_to_srt(self, xml_string):
1248 srt = ''
1249 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1250 # TODO parse xml instead of regex
1251 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1252 if not dur: dur = '4'
1253 start = float(start)
1254 end = start + float(dur)
1255 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1256 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1257 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1258 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1259 srt += str(n) + '\n'
1260 srt += start + ' --> ' + end + '\n'
1261 srt += caption + '\n\n'
1262 return srt
1263
1264 def _print_formats(self, formats):
1265 print 'Available formats:'
1266 for x in formats:
1267 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1268
1269 def _real_initialize(self):
1270 if self._downloader is None:
1271 return
1272
1273 username = None
1274 password = None
1275 downloader_params = self._downloader.params
1276
1277 # Attempt to use provided username and password or .netrc data
1278 if downloader_params.get('username', None) is not None:
1279 username = downloader_params['username']
1280 password = downloader_params['password']
1281 elif downloader_params.get('usenetrc', False):
1282 try:
1283 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1284 if info is not None:
1285 username = info[0]
1286 password = info[2]
1287 else:
1288 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1289 except (IOError, netrc.NetrcParseError), err:
1290 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1291 return
1292
1293 # Set language
1294 request = urllib2.Request(self._LANG_URL)
1295 try:
1296 self.report_lang()
1297 urllib2.urlopen(request).read()
1298 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1299 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1300 return
1301
1302 # No authentication to be performed
1303 if username is None:
1304 return
1305
1306 # Log in
1307 login_form = {
1308 'current_form': 'loginForm',
1309 'next': '/',
1310 'action_login': 'Log In',
1311 'username': username,
1312 'password': password,
1313 }
1314 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1315 try:
1316 self.report_login()
1317 login_results = urllib2.urlopen(request).read()
1318 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1319 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1320 return
1321 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1322 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1323 return
1324
1325 # Confirm age
1326 age_form = {
1327 'next_url': '/',
1328 'action_confirm': 'Confirm',
1329 }
1330 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1331 try:
1332 self.report_age_confirmation()
1333 age_results = urllib2.urlopen(request).read()
1334 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1335 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1336 return
1337
1338 def _real_extract(self, url):
1339 # Extract video id from URL
1340 mobj = re.match(self._VALID_URL, url)
1341 if mobj is None:
1342 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1343 return
1344 video_id = mobj.group(2)
1345
1346 # Get video webpage
1347 self.report_video_webpage_download(video_id)
1348 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1349 try:
1350 video_webpage = urllib2.urlopen(request).read()
1351 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1352 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1353 return
1354
1355 # Attempt to extract SWF player URL
1356 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1357 if mobj is not None:
1358 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1359 else:
1360 player_url = None
1361
1362 # Get video info
1363 self.report_video_info_webpage_download(video_id)
1364 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1365 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1366 % (video_id, el_type))
1367 request = urllib2.Request(video_info_url)
1368 try:
1369 video_info_webpage = urllib2.urlopen(request).read()
1370 video_info = parse_qs(video_info_webpage)
1371 if 'token' in video_info:
1372 break
1373 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1374 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1375 return
1376 if 'token' not in video_info:
1377 if 'reason' in video_info:
1378 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1379 else:
1380 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1381 return
1382
1383 # Start extracting information
1384 self.report_information_extraction(video_id)
1385
1386 # uploader
1387 if 'author' not in video_info:
1388 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1389 return
1390 video_uploader = urllib.unquote_plus(video_info['author'][0])
1391
1392 # title
1393 if 'title' not in video_info:
1394 self._downloader.trouble(u'ERROR: unable to extract video title')
1395 return
1396 video_title = urllib.unquote_plus(video_info['title'][0])
1397 video_title = video_title.decode('utf-8')
1398 video_title = sanitize_title(video_title)
1399
1400 # simplified title
1401 simple_title = _simplify_title(video_title)
1402
1403 # thumbnail image
1404 if 'thumbnail_url' not in video_info:
1405 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1406 video_thumbnail = ''
1407 else: # don't panic if we can't find it
1408 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1409
1410 # upload date
1411 upload_date = u'NA'
1412 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1413 if mobj is not None:
1414 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1415 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1416 for expression in format_expressions:
1417 try:
1418 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1419 except:
1420 pass
1421
1422 # description
1423 try:
1424 lxml.etree
1425 except NameError:
1426 video_description = u'No description available.'
1427 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1428 if mobj is not None:
1429 video_description = mobj.group(1).decode('utf-8')
1430 else:
1431 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1432 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1433 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1434 # TODO use another parser
1435
1436 # closed captions
1437 video_subtitles = None
1438 if self._downloader.params.get('writesubtitles', False):
1439 self.report_video_subtitles_download(video_id)
1440 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1441 try:
1442 srt_list = urllib2.urlopen(request).read()
1443 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1444 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1445 else:
1446 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1447 if srt_lang_list:
1448 if self._downloader.params.get('subtitleslang', False):
1449 srt_lang = self._downloader.params.get('subtitleslang')
1450 elif 'en' in srt_lang_list:
1451 srt_lang = 'en'
1452 else:
1453 srt_lang = srt_lang_list[0]
1454 if not srt_lang in srt_lang_list:
1455 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1456 else:
1457 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1458 try:
1459 srt_xml = urllib2.urlopen(request).read()
1460 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1461 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1462 else:
1463 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1464 else:
1465 self._downloader.trouble(u'WARNING: video has no closed captions')
1466
1467 # token
1468 video_token = urllib.unquote_plus(video_info['token'][0])
1469
1470 # Decide which formats to download
1471 req_format = self._downloader.params.get('format', None)
1472
1473 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1474 self.report_rtmp_download()
1475 video_url_list = [(None, video_info['conn'][0])]
1476 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1477 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1478 url_data = [parse_qs(uds) for uds in url_data_strs]
1479 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1480 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1481
1482 format_limit = self._downloader.params.get('format_limit', None)
1483 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1484 if format_limit is not None and format_limit in available_formats:
1485 format_list = available_formats[available_formats.index(format_limit):]
1486 else:
1487 format_list = available_formats
1488 existing_formats = [x for x in format_list if x in url_map]
1489 if len(existing_formats) == 0:
1490 self._downloader.trouble(u'ERROR: no known formats available for video')
1491 return
1492 if self._downloader.params.get('listformats', None):
1493 self._print_formats(existing_formats)
1494 return
1495 if req_format is None or req_format == 'best':
1496 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1497 elif req_format == 'worst':
1498 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1499 elif req_format in ('-1', 'all'):
1500 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1501 else:
1502 # Specific formats. We pick the first in a slash-delimeted sequence.
1503 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1504 req_formats = req_format.split('/')
1505 video_url_list = None
1506 for rf in req_formats:
1507 if rf in url_map:
1508 video_url_list = [(rf, url_map[rf])]
1509 break
1510 if video_url_list is None:
1511 self._downloader.trouble(u'ERROR: requested format not available')
1512 return
1513 else:
1514 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1515 return
1516
1517 for format_param, video_real_url in video_url_list:
1518 # At this point we have a new video
1519 self._downloader.increment_downloads()
1520
1521 # Extension
1522 video_extension = self._video_extensions.get(format_param, 'flv')
1523
1524 try:
1525 # Process video information
1526 self._downloader.process_info({
1527 'id': video_id.decode('utf-8'),
1528 'url': video_real_url.decode('utf-8'),
1529 'uploader': video_uploader.decode('utf-8'),
1530 'upload_date': upload_date,
1531 'title': video_title,
1532 'stitle': simple_title,
1533 'ext': video_extension.decode('utf-8'),
1534 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1535 'thumbnail': video_thumbnail.decode('utf-8'),
1536 'description': video_description,
1537 'player_url': player_url,
1538 'subtitles': video_subtitles
1539 })
1540 except UnavailableVideoError, err:
1541 self._downloader.trouble(u'\nERROR: unable to download video')
1542
1543
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE instance used to delegate 'yt-' prefixed videos (set in __init__).
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		"""Store the YoutubeIE used to handle videos hosted on YouTube."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and POST the family-filter opt-out."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
		}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			# Response body is unused; the request sets the session cookie.
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract the video info dict for a Metacafe URL and hand it to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube; 'yt-<id>' ids are YouTube
		# embeds and are delegated to the YouTube extractor.
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			# Old-style page: direct mediaURL parameter.
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey (access token appended to the media URL) if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Newer-style page: media data lives in the flashvars parameter.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1684
1685
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		"""Plain delegation to the InfoExtractor constructor."""
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract the video info dict for a Dailymotion URL and hand it to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information.
		# The cookie disables the family filter so age-gated pages load.
		request = urllib2.Request(url)
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		# 'sdURL' is the standard-definition stream inside the sequence data.
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1772
1773
1774 class GoogleIE(InfoExtractor):
1775 """Information extractor for video.google.com."""
1776
1777 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1778 IE_NAME = u'video.google'
1779
1780 def __init__(self, downloader=None):
1781 InfoExtractor.__init__(self, downloader)
1782
1783 def report_download_webpage(self, video_id):
1784 """Report webpage download."""
1785 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1786
1787 def report_extraction(self, video_id):
1788 """Report information extraction."""
1789 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1790
1791 def _real_extract(self, url):
1792 # Extract id from URL
1793 mobj = re.match(self._VALID_URL, url)
1794 if mobj is None:
1795 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1796 return
1797
1798 # At this point we have a new video
1799 self._downloader.increment_downloads()
1800 video_id = mobj.group(1)
1801
1802 video_extension = 'mp4'
1803
1804 # Retrieve video webpage to extract further information
1805 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1806 try:
1807 self.report_download_webpage(video_id)
1808 webpage = urllib2.urlopen(request).read()
1809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1810 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1811 return
1812
1813 # Extract URL, uploader, and title from webpage
1814 self.report_extraction(video_id)
1815 mobj = re.search(r"download_url:'([^']+)'", webpage)
1816 if mobj is None:
1817 video_extension = 'flv'
1818 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1819 if mobj is None:
1820 self._downloader.trouble(u'ERROR: unable to extract media URL')
1821 return
1822 mediaURL = urllib.unquote(mobj.group(1))
1823 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1824 mediaURL = mediaURL.replace('\\x26', '\x26')
1825
1826 video_url = mediaURL
1827
1828 mobj = re.search(r'<title>(.*)</title>', webpage)
1829 if mobj is None:
1830 self._downloader.trouble(u'ERROR: unable to extract title')
1831 return
1832 video_title = mobj.group(1).decode('utf-8')
1833 video_title = sanitize_title(video_title)
1834 simple_title = _simplify_title(video_title)
1835
1836 # Extract video description
1837 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1838 if mobj is None:
1839 self._downloader.trouble(u'ERROR: unable to extract video description')
1840 return
1841 video_description = mobj.group(1).decode('utf-8')
1842 if not video_description:
1843 video_description = 'No description available.'
1844
1845 # Extract video thumbnail
1846 if self._downloader.params.get('forcethumbnail', False):
1847 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1848 try:
1849 webpage = urllib2.urlopen(request).read()
1850 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1851 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1852 return
1853 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1854 if mobj is None:
1855 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1856 return
1857 video_thumbnail = mobj.group(1)
1858 else: # we need something to pass to process_info
1859 video_thumbnail = ''
1860
1861 try:
1862 # Process video information
1863 self._downloader.process_info({
1864 'id': video_id.decode('utf-8'),
1865 'url': video_url.decode('utf-8'),
1866 'uploader': u'NA',
1867 'upload_date': u'NA',
1868 'title': video_title,
1869 'stitle': simple_title,
1870 'ext': video_extension.decode('utf-8'),
1871 'format': u'NA',
1872 'player_url': None,
1873 })
1874 except UnavailableVideoError:
1875 self._downloader.trouble(u'\nERROR: unable to download video')
1876
1877
1878 class PhotobucketIE(InfoExtractor):
1879 """Information extractor for photobucket.com."""
1880
1881 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1882 IE_NAME = u'photobucket'
1883
1884 def __init__(self, downloader=None):
1885 InfoExtractor.__init__(self, downloader)
1886
1887 def report_download_webpage(self, video_id):
1888 """Report webpage download."""
1889 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1890
1891 def report_extraction(self, video_id):
1892 """Report information extraction."""
1893 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1894
1895 def _real_extract(self, url):
1896 # Extract id from URL
1897 mobj = re.match(self._VALID_URL, url)
1898 if mobj is None:
1899 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1900 return
1901
1902 # At this point we have a new video
1903 self._downloader.increment_downloads()
1904 video_id = mobj.group(1)
1905
1906 video_extension = 'flv'
1907
1908 # Retrieve video webpage to extract further information
1909 request = urllib2.Request(url)
1910 try:
1911 self.report_download_webpage(video_id)
1912 webpage = urllib2.urlopen(request).read()
1913 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1914 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1915 return
1916
1917 # Extract URL, uploader, and title from webpage
1918 self.report_extraction(video_id)
1919 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1920 if mobj is None:
1921 self._downloader.trouble(u'ERROR: unable to extract media URL')
1922 return
1923 mediaURL = urllib.unquote(mobj.group(1))
1924
1925 video_url = mediaURL
1926
1927 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1928 if mobj is None:
1929 self._downloader.trouble(u'ERROR: unable to extract title')
1930 return
1931 video_title = mobj.group(1).decode('utf-8')
1932 video_title = sanitize_title(video_title)
1933 simple_title = _simplify_title(vide_title)
1934
1935 video_uploader = mobj.group(2).decode('utf-8')
1936
1937 try:
1938 # Process video information
1939 self._downloader.process_info({
1940 'id': video_id.decode('utf-8'),
1941 'url': video_url.decode('utf-8'),
1942 'uploader': video_uploader,
1943 'upload_date': u'NA',
1944 'title': video_title,
1945 'stitle': simple_title,
1946 'ext': video_extension.decode('utf-8'),
1947 'format': u'NA',
1948 'player_url': None,
1949 })
1950 except UnavailableVideoError:
1951 self._downloader.trouble(u'\nERROR: unable to download video')
1952
1953
1954 class YahooIE(InfoExtractor):
1955 """Information extractor for video.yahoo.com."""
1956
1957 # _VALID_URL matches all Yahoo! Video URLs
1958 # _VPAGE_URL matches only the extractable '/watch/' URLs
1959 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1960 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1961 IE_NAME = u'video.yahoo'
1962
1963 def __init__(self, downloader=None):
1964 InfoExtractor.__init__(self, downloader)
1965
1966 def report_download_webpage(self, video_id):
1967 """Report webpage download."""
1968 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1969
1970 def report_extraction(self, video_id):
1971 """Report information extraction."""
1972 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1973
1974 def _real_extract(self, url, new_video=True):
1975 # Extract ID from URL
1976 mobj = re.match(self._VALID_URL, url)
1977 if mobj is None:
1978 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1979 return
1980
1981 # At this point we have a new video
1982 self._downloader.increment_downloads()
1983 video_id = mobj.group(2)
1984 video_extension = 'flv'
1985
1986 # Rewrite valid but non-extractable URLs as
1987 # extractable English language /watch/ URLs
1988 if re.match(self._VPAGE_URL, url) is None:
1989 request = urllib2.Request(url)
1990 try:
1991 webpage = urllib2.urlopen(request).read()
1992 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1993 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1994 return
1995
1996 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1997 if mobj is None:
1998 self._downloader.trouble(u'ERROR: Unable to extract id field')
1999 return
2000 yahoo_id = mobj.group(1)
2001
2002 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2003 if mobj is None:
2004 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2005 return
2006 yahoo_vid = mobj.group(1)
2007
2008 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2009 return self._real_extract(url, new_video=False)
2010
2011 # Retrieve video webpage to extract further information
2012 request = urllib2.Request(url)
2013 try:
2014 self.report_download_webpage(video_id)
2015 webpage = urllib2.urlopen(request).read()
2016 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2017 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2018 return
2019
2020 # Extract uploader and title from webpage
2021 self.report_extraction(video_id)
2022 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2023 if mobj is None:
2024 self._downloader.trouble(u'ERROR: unable to extract video title')
2025 return
2026 video_title = mobj.group(1).decode('utf-8')
2027 simple_title = _simplify_title(video_title)
2028
2029 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2030 if mobj is None:
2031 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2032 return
2033 video_uploader = mobj.group(1).decode('utf-8')
2034
2035 # Extract video thumbnail
2036 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2037 if mobj is None:
2038 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2039 return
2040 video_thumbnail = mobj.group(1).decode('utf-8')
2041
2042 # Extract video description
2043 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2044 if mobj is None:
2045 self._downloader.trouble(u'ERROR: unable to extract video description')
2046 return
2047 video_description = mobj.group(1).decode('utf-8')
2048 if not video_description:
2049 video_description = 'No description available.'
2050
2051 # Extract video height and width
2052 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2053 if mobj is None:
2054 self._downloader.trouble(u'ERROR: unable to extract video height')
2055 return
2056 yv_video_height = mobj.group(1)
2057
2058 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2059 if mobj is None:
2060 self._downloader.trouble(u'ERROR: unable to extract video width')
2061 return
2062 yv_video_width = mobj.group(1)
2063
2064 # Retrieve video playlist to extract media URL
2065 # I'm not completely sure what all these options are, but we
2066 # seem to need most of them, otherwise the server sends a 401.
2067 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2068 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2069 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2070 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2071 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2072 try:
2073 self.report_download_webpage(video_id)
2074 webpage = urllib2.urlopen(request).read()
2075 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2076 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2077 return
2078
2079 # Extract media URL from playlist XML
2080 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2081 if mobj is None:
2082 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2083 return
2084 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2085 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2086
2087 try:
2088 # Process video information
2089 self._downloader.process_info({
2090 'id': video_id.decode('utf-8'),
2091 'url': video_url,
2092 'uploader': video_uploader,
2093 'upload_date': u'NA',
2094 'title': video_title,
2095 'stitle': simple_title,
2096 'ext': video_extension.decode('utf-8'),
2097 'thumbnail': video_thumbnail.decode('utf-8'),
2098 'description': video_description,
2099 'thumbnail': video_thumbnail,
2100 'player_url': None,
2101 })
2102 except UnavailableVideoError:
2103 self._downloader.trouble(u'\nERROR: unable to download video')
2104
2105
2106 class VimeoIE(InfoExtractor):
2107 """Information extractor for vimeo.com."""
2108
2109 # _VALID_URL matches Vimeo URLs
2110 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2111 IE_NAME = u'vimeo'
2112
2113 def __init__(self, downloader=None):
2114 InfoExtractor.__init__(self, downloader)
2115
2116 def report_download_webpage(self, video_id):
2117 """Report webpage download."""
2118 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2119
2120 def report_extraction(self, video_id):
2121 """Report information extraction."""
2122 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2123
2124 def _real_extract(self, url, new_video=True):
2125 # Extract ID from URL
2126 mobj = re.match(self._VALID_URL, url)
2127 if mobj is None:
2128 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2129 return
2130
2131 # At this point we have a new video
2132 self._downloader.increment_downloads()
2133 video_id = mobj.group(1)
2134
2135 # Retrieve video webpage to extract further information
2136 request = urllib2.Request(url, None, std_headers)
2137 try:
2138 self.report_download_webpage(video_id)
2139 webpage = urllib2.urlopen(request).read()
2140 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2141 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2142 return
2143
2144 # Now we begin extracting as much information as we can from what we
2145 # retrieved. First we extract the information common to all extractors,
2146 # and latter we extract those that are Vimeo specific.
2147 self.report_extraction(video_id)
2148
2149 # Extract the config JSON
2150 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2151 try:
2152 config = json.loads(config)
2153 except:
2154 self._downloader.trouble(u'ERROR: unable to extract info section')
2155 return
2156
2157 # Extract title
2158 video_title = config["video"]["title"]
2159 simple_title = _simplify_title(video_title)
2160
2161 # Extract uploader
2162 video_uploader = config["video"]["owner"]["name"]
2163
2164 # Extract video thumbnail
2165 video_thumbnail = config["video"]["thumbnail"]
2166
2167 # Extract video description
2168 try:
2169 lxml.etree
2170 except NameError:
2171 video_description = u'No description available.'
2172 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2173 if mobj is not None:
2174 video_description = mobj.group(1)
2175 else:
2176 html_parser = lxml.etree.HTMLParser()
2177 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2178 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2179 # TODO use another parser
2180
2181 # Extract upload date
2182 video_upload_date = u'NA'
2183 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2184 if mobj is not None:
2185 video_upload_date = mobj.group(1)
2186
2187 # Vimeo specific: extract request signature and timestamp
2188 sig = config['request']['signature']
2189 timestamp = config['request']['timestamp']
2190
2191 # Vimeo specific: extract video codec and quality information
2192 # TODO bind to format param
2193 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2194 for codec in codecs:
2195 if codec[0] in config["video"]["files"]:
2196 video_codec = codec[0]
2197 video_extension = codec[1]
2198 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2199 else: quality = 'sd'
2200 break
2201 else:
2202 self._downloader.trouble(u'ERROR: no known codec found')
2203 return
2204
2205 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2206 %(video_id, sig, timestamp, quality, video_codec.upper())
2207
2208 try:
2209 # Process video information
2210 self._downloader.process_info({
2211 'id': video_id,
2212 'url': video_url,
2213 'uploader': video_uploader,
2214 'upload_date': video_upload_date,
2215 'title': video_title,
2216 'stitle': simple_title,
2217 'ext': video_extension,
2218 'thumbnail': video_thumbnail,
2219 'description': video_description,
2220 'player_url': None,
2221 })
2222 except UnavailableVideoError:
2223 self._downloader.trouble(u'ERROR: unable to download video')
2224
2225
2226 class GenericIE(InfoExtractor):
2227 """Generic last-resort information extractor."""
2228
2229 _VALID_URL = r'.*'
2230 IE_NAME = u'generic'
2231
2232 def __init__(self, downloader=None):
2233 InfoExtractor.__init__(self, downloader)
2234
2235 def report_download_webpage(self, video_id):
2236 """Report webpage download."""
2237 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2238 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2239
2240 def report_extraction(self, video_id):
2241 """Report information extraction."""
2242 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2243
2244 def _real_extract(self, url):
2245 # At this point we have a new video
2246 self._downloader.increment_downloads()
2247
2248 video_id = url.split('/')[-1]
2249 request = urllib2.Request(url)
2250 try:
2251 self.report_download_webpage(video_id)
2252 webpage = urllib2.urlopen(request).read()
2253 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2254 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2255 return
2256 except ValueError, err:
2257 # since this is the last-resort InfoExtractor, if
2258 # this error is thrown, it'll be thrown here
2259 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2260 return
2261
2262 self.report_extraction(video_id)
2263 # Start with something easy: JW Player in SWFObject
2264 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2265 if mobj is None:
2266 # Broaden the search a little bit
2267 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2268 if mobj is None:
2269 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2270 return
2271
2272 # It's possible that one of the regexes
2273 # matched, but returned an empty group:
2274 if mobj.group(1) is None:
2275 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2276 return
2277
2278 video_url = urllib.unquote(mobj.group(1))
2279 video_id = os.path.basename(video_url)
2280
2281 # here's a fun little line of code for you:
2282 video_extension = os.path.splitext(video_id)[1][1:]
2283 video_id = os.path.splitext(video_id)[0]
2284
2285 # it's tempting to parse this further, but you would
2286 # have to take into account all the variations like
2287 # Video Title - Site Name
2288 # Site Name | Video Title
2289 # Video Title - Tagline | Site Name
2290 # and so on and so forth; it's just not practical
2291 mobj = re.search(r'<title>(.*)</title>', webpage)
2292 if mobj is None:
2293 self._downloader.trouble(u'ERROR: unable to extract title')
2294 return
2295 video_title = mobj.group(1).decode('utf-8')
2296 video_title = sanitize_title(video_title)
2297 simple_title = _simplify_title(video_title)
2298
2299 # video uploader is domain name
2300 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2301 if mobj is None:
2302 self._downloader.trouble(u'ERROR: unable to extract title')
2303 return
2304 video_uploader = mobj.group(1).decode('utf-8')
2305
2306 try:
2307 # Process video information
2308 self._downloader.process_info({
2309 'id': video_id.decode('utf-8'),
2310 'url': video_url.decode('utf-8'),
2311 'uploader': video_uploader,
2312 'upload_date': u'NA',
2313 'title': video_title,
2314 'stitle': simple_title,
2315 'ext': video_extension.decode('utf-8'),
2316 'format': u'NA',
2317 'player_url': None,
2318 })
2319 except UnavailableVideoError, err:
2320 self._downloader.trouble(u'\nERROR: unable to download video')
2321
2322
2323 class YoutubeSearchIE(InfoExtractor):
2324 """Information Extractor for YouTube search queries."""
2325 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2326 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2327 _youtube_ie = None
2328 _max_youtube_results = 1000
2329 IE_NAME = u'youtube:search'
2330
2331 def __init__(self, youtube_ie, downloader=None):
2332 InfoExtractor.__init__(self, downloader)
2333 self._youtube_ie = youtube_ie
2334
2335 def report_download_page(self, query, pagenum):
2336 """Report attempt to download playlist page with given number."""
2337 query = query.decode(preferredencoding())
2338 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2339
2340 def _real_initialize(self):
2341 self._youtube_ie.initialize()
2342
2343 def _real_extract(self, query):
2344 mobj = re.match(self._VALID_URL, query)
2345 if mobj is None:
2346 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2347 return
2348
2349 prefix, query = query.split(':')
2350 prefix = prefix[8:]
2351 query = query.encode('utf-8')
2352 if prefix == '':
2353 self._download_n_results(query, 1)
2354 return
2355 elif prefix == 'all':
2356 self._download_n_results(query, self._max_youtube_results)
2357 return
2358 else:
2359 try:
2360 n = long(prefix)
2361 if n <= 0:
2362 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2363 return
2364 elif n > self._max_youtube_results:
2365 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2366 n = self._max_youtube_results
2367 self._download_n_results(query, n)
2368 return
2369 except ValueError: # parsing prefix as integer fails
2370 self._download_n_results(query, 1)
2371 return
2372
2373 def _download_n_results(self, query, n):
2374 """Downloads a specified number of results for a query"""
2375
2376 video_ids = []
2377 pagenum = 0
2378 limit = n
2379
2380 while (50 * pagenum) < limit:
2381 self.report_download_page(query, pagenum+1)
2382 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2383 request = urllib2.Request(result_url)
2384 try:
2385 data = urllib2.urlopen(request).read()
2386 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2387 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2388 return
2389 api_response = json.loads(data)['data']
2390
2391 new_ids = list(video['id'] for video in api_response['items'])
2392 video_ids += new_ids
2393
2394 limit = min(n, api_response['totalItems'])
2395 pagenum += 1
2396
2397 if len(video_ids) > n:
2398 video_ids = video_ids[:n]
2399 for id in video_ids:
2400 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2401 return
2402
2403
2404 class GoogleSearchIE(InfoExtractor):
2405 """Information Extractor for Google Video search queries."""
2406 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2407 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2408 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2409 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2410 _google_ie = None
2411 _max_google_results = 1000
2412 IE_NAME = u'video.google:search'
2413
2414 def __init__(self, google_ie, downloader=None):
2415 InfoExtractor.__init__(self, downloader)
2416 self._google_ie = google_ie
2417
2418 def report_download_page(self, query, pagenum):
2419 """Report attempt to download playlist page with given number."""
2420 query = query.decode(preferredencoding())
2421 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2422
2423 def _real_initialize(self):
2424 self._google_ie.initialize()
2425
2426 def _real_extract(self, query):
2427 mobj = re.match(self._VALID_URL, query)
2428 if mobj is None:
2429 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2430 return
2431
2432 prefix, query = query.split(':')
2433 prefix = prefix[8:]
2434 query = query.encode('utf-8')
2435 if prefix == '':
2436 self._download_n_results(query, 1)
2437 return
2438 elif prefix == 'all':
2439 self._download_n_results(query, self._max_google_results)
2440 return
2441 else:
2442 try:
2443 n = long(prefix)
2444 if n <= 0:
2445 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2446 return
2447 elif n > self._max_google_results:
2448 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2449 n = self._max_google_results
2450 self._download_n_results(query, n)
2451 return
2452 except ValueError: # parsing prefix as integer fails
2453 self._download_n_results(query, 1)
2454 return
2455
2456 def _download_n_results(self, query, n):
2457 """Downloads a specified number of results for a query"""
2458
2459 video_ids = []
2460 pagenum = 0
2461
2462 while True:
2463 self.report_download_page(query, pagenum)
2464 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2465 request = urllib2.Request(result_url)
2466 try:
2467 page = urllib2.urlopen(request).read()
2468 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2469 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2470 return
2471
2472 # Extract video identifiers
2473 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2474 video_id = mobj.group(1)
2475 if video_id not in video_ids:
2476 video_ids.append(video_id)
2477 if len(video_ids) == n:
2478 # Specified n videos reached
2479 for id in video_ids:
2480 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2481 return
2482
2483 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2484 for id in video_ids:
2485 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2486 return
2487
2488 pagenum = pagenum + 1
2489
2490
2491 class YahooSearchIE(InfoExtractor):
2492 """Information Extractor for Yahoo! Video search queries."""
2493 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2494 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2495 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2496 _MORE_PAGES_INDICATOR = r'\s*Next'
2497 _yahoo_ie = None
2498 _max_yahoo_results = 1000
2499 IE_NAME = u'video.yahoo:search'
2500
2501 def __init__(self, yahoo_ie, downloader=None):
2502 InfoExtractor.__init__(self, downloader)
2503 self._yahoo_ie = yahoo_ie
2504
2505 def report_download_page(self, query, pagenum):
2506 """Report attempt to download playlist page with given number."""
2507 query = query.decode(preferredencoding())
2508 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2509
2510 def _real_initialize(self):
2511 self._yahoo_ie.initialize()
2512
2513 def _real_extract(self, query):
2514 mobj = re.match(self._VALID_URL, query)
2515 if mobj is None:
2516 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2517 return
2518
2519 prefix, query = query.split(':')
2520 prefix = prefix[8:]
2521 query = query.encode('utf-8')
2522 if prefix == '':
2523 self._download_n_results(query, 1)
2524 return
2525 elif prefix == 'all':
2526 self._download_n_results(query, self._max_yahoo_results)
2527 return
2528 else:
2529 try:
2530 n = long(prefix)
2531 if n <= 0:
2532 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2533 return
2534 elif n > self._max_yahoo_results:
2535 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2536 n = self._max_yahoo_results
2537 self._download_n_results(query, n)
2538 return
2539 except ValueError: # parsing prefix as integer fails
2540 self._download_n_results(query, 1)
2541 return
2542
2543 def _download_n_results(self, query, n):
2544 """Downloads a specified number of results for a query"""
2545
2546 video_ids = []
2547 already_seen = set()
2548 pagenum = 1
2549
2550 while True:
2551 self.report_download_page(query, pagenum)
2552 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2553 request = urllib2.Request(result_url)
2554 try:
2555 page = urllib2.urlopen(request).read()
2556 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2557 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2558 return
2559
2560 # Extract video identifiers
2561 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2562 video_id = mobj.group(1)
2563 if video_id not in already_seen:
2564 video_ids.append(video_id)
2565 already_seen.add(video_id)
2566 if len(video_ids) == n:
2567 # Specified n videos reached
2568 for id in video_ids:
2569 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2570 return
2571
2572 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2573 for id in video_ids:
2574 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2575 return
2576
2577 pagenum = pagenum + 1
2578
2579
2580 class YoutubePlaylistIE(InfoExtractor):
2581 """Information Extractor for YouTube playlists."""
2582
2583 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2584 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2585 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
2586 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2587 _youtube_ie = None
2588 IE_NAME = u'youtube:playlist'
2589
2590 def __init__(self, youtube_ie, downloader=None):
2591 InfoExtractor.__init__(self, downloader)
2592 self._youtube_ie = youtube_ie
2593
2594 def report_download_page(self, playlist_id, pagenum):
2595 """Report attempt to download playlist page with given number."""
2596 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2597
2598 def _real_initialize(self):
2599 self._youtube_ie.initialize()
2600
2601 def _real_extract(self, url):
2602 # Extract playlist id
2603 mobj = re.match(self._VALID_URL, url)
2604 if mobj is None:
2605 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2606 return
2607
2608 # Single video case
2609 if mobj.group(3) is not None:
2610 self._youtube_ie.extract(mobj.group(3))
2611 return
2612
2613 # Download playlist pages
2614 # prefix is 'p' as default for playlists but there are other types that need extra care
2615 playlist_prefix = mobj.group(1)
2616 if playlist_prefix == 'a':
2617 playlist_access = 'artist'
2618 else:
2619 playlist_prefix = 'p'
2620 playlist_access = 'view_play_list'
2621 playlist_id = mobj.group(2)
2622 video_ids = []
2623 pagenum = 1
2624
2625 while True:
2626 self.report_download_page(playlist_id, pagenum)
2627 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2628 request = urllib2.Request(url)
2629 try:
2630 page = urllib2.urlopen(request).read()
2631 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2632 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2633 return
2634
2635 # Extract video identifiers
2636 ids_in_page = []
2637 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2638 if mobj.group(1) not in ids_in_page:
2639 ids_in_page.append(mobj.group(1))
2640 video_ids.extend(ids_in_page)
2641
2642 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2643 break
2644 pagenum = pagenum + 1
2645
2646 playliststart = self._downloader.params.get('playliststart', 1) - 1
2647 playlistend = self._downloader.params.get('playlistend', -1)
2648 if playlistend == -1:
2649 video_ids = video_ids[playliststart:]
2650 else:
2651 video_ids = video_ids[playliststart:playlistend]
2652
2653 for id in video_ids:
2654 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2655 return
2656
2657
2658 class YoutubeUserIE(InfoExtractor):
2659 """Information Extractor for YouTube users."""
2660
2661 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2662 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2663 _GDATA_PAGE_SIZE = 50
2664 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2665 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2666 _youtube_ie = None
2667 IE_NAME = u'youtube:user'
2668
2669 def __init__(self, youtube_ie, downloader=None):
2670 InfoExtractor.__init__(self, downloader)
2671 self._youtube_ie = youtube_ie
2672
2673 def report_download_page(self, username, start_index):
2674 """Report attempt to download user page."""
2675 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2676 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2677
2678 def _real_initialize(self):
2679 self._youtube_ie.initialize()
2680
2681 def _real_extract(self, url):
2682 # Extract username
2683 mobj = re.match(self._VALID_URL, url)
2684 if mobj is None:
2685 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2686 return
2687
2688 username = mobj.group(1)
2689
2690 # Download video ids using YouTube Data API. Result size per
2691 # query is limited (currently to 50 videos) so we need to query
2692 # page by page until there are no video ids - it means we got
2693 # all of them.
2694
2695 video_ids = []
2696 pagenum = 0
2697
2698 while True:
2699 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2700 self.report_download_page(username, start_index)
2701
2702 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2703
2704 try:
2705 page = urllib2.urlopen(request).read()
2706 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2707 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2708 return
2709
2710 # Extract video identifiers
2711 ids_in_page = []
2712
2713 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2714 if mobj.group(1) not in ids_in_page:
2715 ids_in_page.append(mobj.group(1))
2716
2717 video_ids.extend(ids_in_page)
2718
2719 # A little optimization - if current page is not
2720 # "full", ie. does not contain PAGE_SIZE video ids then
2721 # we can assume that this page is the last one - there
2722 # are no more ids on further pages - no need to query
2723 # again.
2724
2725 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2726 break
2727
2728 pagenum += 1
2729
2730 all_ids_count = len(video_ids)
2731 playliststart = self._downloader.params.get('playliststart', 1) - 1
2732 playlistend = self._downloader.params.get('playlistend', -1)
2733
2734 if playlistend == -1:
2735 video_ids = video_ids[playliststart:]
2736 else:
2737 video_ids = video_ids[playliststart:playlistend]
2738
2739 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2740 (username, all_ids_count, len(video_ids)))
2741
2742 for video_id in video_ids:
2743 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2744
2745
2746 class DepositFilesIE(InfoExtractor):
2747 """Information extractor for depositfiles.com"""
2748
2749 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2750 IE_NAME = u'DepositFiles'
2751
2752 def __init__(self, downloader=None):
2753 InfoExtractor.__init__(self, downloader)
2754
2755 def report_download_webpage(self, file_id):
2756 """Report webpage download."""
2757 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2758
2759 def report_extraction(self, file_id):
2760 """Report information extraction."""
2761 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2762
2763 def _real_extract(self, url):
2764 # At this point we have a new file
2765 self._downloader.increment_downloads()
2766
2767 file_id = url.split('/')[-1]
2768 # Rebuild url in english locale
2769 url = 'http://depositfiles.com/en/files/' + file_id
2770
2771 # Retrieve file webpage with 'Free download' button pressed
2772 free_download_indication = { 'gateway_result' : '1' }
2773 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2774 try:
2775 self.report_download_webpage(file_id)
2776 webpage = urllib2.urlopen(request).read()
2777 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2778 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2779 return
2780
2781 # Search for the real file URL
2782 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2783 if (mobj is None) or (mobj.group(1) is None):
2784 # Try to figure out reason of the error.
2785 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2786 if (mobj is not None) and (mobj.group(1) is not None):
2787 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2788 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2789 else:
2790 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2791 return
2792
2793 file_url = mobj.group(1)
2794 file_extension = os.path.splitext(file_url)[1][1:]
2795
2796 # Search for file title
2797 mobj = re.search(r'<b title="(.*?)">', webpage)
2798 if mobj is None:
2799 self._downloader.trouble(u'ERROR: unable to extract title')
2800 return
2801 file_title = mobj.group(1).decode('utf-8')
2802
2803 try:
2804 # Process file information
2805 self._downloader.process_info({
2806 'id': file_id.decode('utf-8'),
2807 'url': file_url.decode('utf-8'),
2808 'uploader': u'NA',
2809 'upload_date': u'NA',
2810 'title': file_title,
2811 'stitle': file_title,
2812 'ext': file_extension.decode('utf-8'),
2813 'format': u'NA',
2814 'player_url': None,
2815 })
2816 except UnavailableVideoError, err:
2817 self._downloader.trouble(u'ERROR: unable to download file')
2818
2819
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	_NETRC_MACHINE = 'facebook'
	# Ordered best quality first: the format selection in _real_extract
	# treats index 0 as "best" and the last entry as "worst".
	_available_formats = ['video', 'highqual', 'lowqual']
	_video_extensions = {
		'video': 'mp4',
		'highqual': 'mp4',
		'lowqual': 'mp4',
	}
	IE_NAME = u'facebook'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page"""
		# General data
		data = {'title': r'\("video_title", "(.*?)"\)',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
			}
		video_info = {}
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				# Fields missing from the page are simply absent from
				# video_info; callers must check each key themselves.
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		# Video urls
		video_urls = {}
		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

		return video_info

	def _real_initialize(self):
		"""Log in to Facebook with --username/--password or .netrc credentials.

		Failures are reported as warnings only; extraction proceeds
		anonymously when no credentials are available or login fails.
		"""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# No credentials: continue without logging in.
		if useremail is None:
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# A login <form> in the response means we are still on the
			# login page, i.e. authentication failed.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Download the video page, parse it and hand each chosen format to process_info."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
		try:
			page = urllib2.urlopen(request)
			video_webpage = page.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information
		video_info = self._parse_page(video_webpage)

		# uploader
		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = video_info['owner']

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		simple_title = _simplify_title(video_title)

		# thumbnail image (non-fatal when missing)
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:
			video_thumbnail = video_info['thumbnail']

		# upload date
		upload_date = u'NA'
		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
				try:
					upload_date = time.strftime('%Y%m%d', timetuple[0:9])
				except:
					# Deliberately best-effort: a malformed date leaves
					# upload_date as u'NA'.
					pass

		# description
		video_description = video_info.get('description', 'No description available.')

		url_map = video_info['video_urls']
		# NOTE(review): if url_map is empty, video_url_list is never bound
		# and the for-loop below raises NameError - confirm before relying
		# on this path.
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		for format_param, video_real_url in video_url_list:

			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'mp4')

			try:
				# Process video information
				self._downloader.process_info({
					'id': video_id.decode('utf-8'),
					'url': video_real_url.decode('utf-8'),
					'uploader': video_uploader.decode('utf-8'),
					'upload_date': upload_date,
					'title': video_title,
					'stitle': simple_title,
					'ext': video_extension.decode('utf-8'),
					'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail': video_thumbnail.decode('utf-8'),
					'description': video_description.decode('utf-8'),
					'player_url': None,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
3035
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Pulls the extension off a media URL, e.g. ".../file.flv" -> "flv".
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL turned out to be the media file itself."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Fetch blip.tv JSON metadata for the URL, or detect a direct media link."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Ask blip.tv for JSON metadata for the same page.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# The URL points straight at the media; derive id/title/ext
				# from the filename and reuse the open handle for download.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				# NOTE(review): 'json' is not among this chunk's visible
				# imports - presumably bound elsewhere in the file (e.g. a
				# simplejson fallback); confirm before refactoring.
				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# datestamp arrives as '%m-%d-%y %H:%M%p'; normalize to YYYYMMDD.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3128
3129
3130 class MyVideoIE(InfoExtractor):
3131 """Information Extractor for myvideo.de."""
3132
3133 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3134 IE_NAME = u'myvideo'
3135
3136 def __init__(self, downloader=None):
3137 InfoExtractor.__init__(self, downloader)
3138
3139 def report_download_webpage(self, video_id):
3140 """Report webpage download."""
3141 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3142
3143 def report_extraction(self, video_id):
3144 """Report information extraction."""
3145 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3146
3147 def _real_extract(self,url):
3148 mobj = re.match(self._VALID_URL, url)
3149 if mobj is None:
3150 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3151 return
3152
3153 video_id = mobj.group(1)
3154
3155 # Get video webpage
3156 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3157 try:
3158 self.report_download_webpage(video_id)
3159 webpage = urllib2.urlopen(request).read()
3160 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3161 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3162 return
3163
3164 self.report_extraction(video_id)
3165 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3166 webpage)
3167 if mobj is None:
3168 self._downloader.trouble(u'ERROR: unable to extract media URL')
3169 return
3170 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3171
3172 mobj = re.search('<title>([^<]+)</title>', webpage)
3173 if mobj is None:
3174 self._downloader.trouble(u'ERROR: unable to extract title')
3175 return
3176
3177 video_title = mobj.group(1)
3178 video_title = sanitize_title(video_title)
3179
3180 simple_title = _simplify_title(video_title)
3181
3182 try:
3183 self._downloader.process_info({
3184 'id': video_id,
3185 'url': video_url,
3186 'uploader': u'NA',
3187 'upload_date': u'NA',
3188 'title': video_title,
3189 'stitle': simple_title,
3190 'ext': u'flv',
3191 'format': u'NA',
3192 'player_url': None,
3193 })
3194 except UnavailableVideoError:
3195 self._downloader.trouble(u'\nERROR: Unable to download video')
3196
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts ':tds' / ':colbert' style shortcuts as well as full
	# /full-episodes/ URLs on either show's site.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of a clip's media configuration."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode's clip index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve shortcuts/redirects, then download every clip of the episode."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Map the ':tds' / ':colbert' shortcuts to the shows' episode index.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode in the URL: the index page redirects to the
		# newest episode, which we pick up after the request below.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Re-parse the URL the server redirected us to.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# The player URL may itself redirect; resolve it once up front.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# Each <item> in the MRSS index is one clip of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
					urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			# (NB: 'format' shadows the builtin here; left as-is.)
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3331
3332
3333 class EscapistIE(InfoExtractor):
3334 """Information extractor for The Escapist """
3335
3336 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3337 IE_NAME = u'escapist'
3338
3339 def report_extraction(self, showName):
3340 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3341
3342 def report_config_download(self, showName):
3343 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3344
3345 def _real_extract(self, url):
3346 htmlParser = HTMLParser.HTMLParser()
3347
3348 mobj = re.match(self._VALID_URL, url)
3349 if mobj is None:
3350 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3351 return
3352 showName = mobj.group('showname')
3353 videoId = mobj.group('episode')
3354
3355 self.report_extraction(showName)
3356 try:
3357 webPage = urllib2.urlopen(url).read()
3358 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3359 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3360 return
3361
3362 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3363 description = htmlParser.unescape(descMatch.group(1))
3364 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3365 imgUrl = htmlParser.unescape(imgMatch.group(1))
3366 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3367 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3368 configUrlMatch = re.search('config=(.*)$', playerUrl)
3369 configUrl = urllib2.unquote(configUrlMatch.group(1))
3370
3371 self.report_config_download(showName)
3372 try:
3373 configJSON = urllib2.urlopen(configUrl).read()
3374 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3375 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3376 return
3377
3378 # Technically, it's JavaScript, not JSON
3379 configJSON = configJSON.replace("'", '"')
3380
3381 try:
3382 config = json.loads(configJSON)
3383 except (ValueError,), err:
3384 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3385 return
3386
3387 playlist = config['playlist']
3388 videoUrl = playlist[1]['url']
3389
3390 self._downloader.increment_downloads()
3391 info = {
3392 'id': videoId,
3393 'url': videoUrl,
3394 'uploader': showName,
3395 'upload_date': None,
3396 'title': showName,
3397 'stitle': _simplify_title(showName),
3398 'ext': 'flv',
3399 'format': 'flv',
3400 'thumbnail': imgUrl,
3401 'description': description,
3402 'player_url': playerUrl,
3403 }
3404
3405 try:
3406 self._downloader.process_info(info)
3407 except UnavailableVideoError, err:
3408 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3409
3410
3411 class CollegeHumorIE(InfoExtractor):
3412 """Information extractor for collegehumor.com"""
3413
3414 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3415 IE_NAME = u'collegehumor'
3416
3417 def report_webpage(self, video_id):
3418 """Report information extraction."""
3419 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3420
3421 def report_extraction(self, video_id):
3422 """Report information extraction."""
3423 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3424
3425 def _real_extract(self, url):
3426 htmlParser = HTMLParser.HTMLParser()
3427
3428 mobj = re.match(self._VALID_URL, url)
3429 if mobj is None:
3430 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3431 return
3432 video_id = mobj.group('videoid')
3433
3434 self.report_webpage(video_id)
3435 request = urllib2.Request(url)
3436 try:
3437 webpage = urllib2.urlopen(request).read()
3438 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3439 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3440 return
3441
3442 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3443 if m is None:
3444 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3445 return
3446 internal_video_id = m.group('internalvideoid')
3447
3448 info = {
3449 'id': video_id,
3450 'internal_id': internal_video_id,
3451 }
3452
3453 self.report_extraction(video_id)
3454 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3455 try:
3456 metaXml = urllib2.urlopen(xmlUrl).read()
3457 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3458 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3459 return
3460
3461 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3462 try:
3463 videoNode = mdoc.findall('./video')[0]
3464 info['description'] = videoNode.findall('./description')[0].text
3465 info['title'] = videoNode.findall('./caption')[0].text
3466 info['stitle'] = _simplify_title(info['title'])
3467 info['url'] = videoNode.findall('./file')[0].text
3468 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3469 info['ext'] = info['url'].rpartition('.')[2]
3470 info['format'] = info['ext']
3471 except IndexError:
3472 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3473 return
3474
3475 self._downloader.increment_downloads()
3476
3477 try:
3478 self._downloader.process_info(info)
3479 except UnavailableVideoError, err:
3480 self._downloader.trouble(u'\nERROR: unable to download video')
3481
3482
3483 class XVideosIE(InfoExtractor):
3484 """Information extractor for xvideos.com"""
3485
3486 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3487 IE_NAME = u'xvideos'
3488
3489 def report_webpage(self, video_id):
3490 """Report information extraction."""
3491 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3492
3493 def report_extraction(self, video_id):
3494 """Report information extraction."""
3495 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3496
3497 def _real_extract(self, url):
3498 htmlParser = HTMLParser.HTMLParser()
3499
3500 mobj = re.match(self._VALID_URL, url)
3501 if mobj is None:
3502 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3503 return
3504 video_id = mobj.group(1).decode('utf-8')
3505
3506 self.report_webpage(video_id)
3507
3508 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3509 try:
3510 webpage = urllib2.urlopen(request).read()
3511 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3512 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3513 return
3514
3515 self.report_extraction(video_id)
3516
3517
3518 # Extract video URL
3519 mobj = re.search(r'flv_url=(.+?)&', webpage)
3520 if mobj is None:
3521 self._downloader.trouble(u'ERROR: unable to extract video url')
3522 return
3523 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3524
3525
3526 # Extract title
3527 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3528 if mobj is None:
3529 self._downloader.trouble(u'ERROR: unable to extract video title')
3530 return
3531 video_title = mobj.group(1).decode('utf-8')
3532
3533
3534 # Extract video thumbnail
3535 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3536 if mobj is None:
3537 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3538 return
3539 video_thumbnail = mobj.group(1).decode('utf-8')
3540
3541
3542
3543 self._downloader.increment_downloads()
3544 info = {
3545 'id': video_id,
3546 'url': video_url,
3547 'uploader': None,
3548 'upload_date': None,
3549 'title': video_title,
3550 'stitle': _simplify_title(video_title),
3551 'ext': 'flv',
3552 'format': 'flv',
3553 'thumbnail': video_thumbnail,
3554 'description': None,
3555 'player_url': None,
3556 }
3557
3558 try:
3559 self._downloader.process_info(info)
3560 except UnavailableVideoError, err:
3561 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3562
3563
3564 class SoundcloudIE(InfoExtractor):
3565 """Information extractor for soundcloud.com
3566 To access the media, the uid of the song and a stream token
3567 must be extracted from the page source and the script must make
3568 a request to media.soundcloud.com/crossdomain.xml. Then
3569 the media can be grabbed by requesting from an url composed
3570 of the stream token and uid
3571 """
3572
3573 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3574 IE_NAME = u'soundcloud'
3575
3576 def __init__(self, downloader=None):
3577 InfoExtractor.__init__(self, downloader)
3578
3579 def report_webpage(self, video_id):
3580 """Report information extraction."""
3581 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3582
3583 def report_extraction(self, video_id):
3584 """Report information extraction."""
3585 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3586
3587 def _real_extract(self, url):
3588 htmlParser = HTMLParser.HTMLParser()
3589
3590 mobj = re.match(self._VALID_URL, url)
3591 if mobj is None:
3592 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3593 return
3594
3595 # extract uploader (which is in the url)
3596 uploader = mobj.group(1).decode('utf-8')
3597 # extract simple title (uploader + slug of song title)
3598 slug_title = mobj.group(2).decode('utf-8')
3599 simple_title = uploader + '-' + slug_title
3600
3601 self.report_webpage('%s/%s' % (uploader, slug_title))
3602
3603 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3604 try:
3605 webpage = urllib2.urlopen(request).read()
3606 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3607 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3608 return
3609
3610 self.report_extraction('%s/%s' % (uploader, slug_title))
3611
3612 # extract uid and stream token that soundcloud hands out for access
3613 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3614 if mobj:
3615 video_id = mobj.group(1)
3616 stream_token = mobj.group(2)
3617
3618 # extract unsimplified title
3619 mobj = re.search('"title":"(.*?)",', webpage)
3620 if mobj:
3621 title = mobj.group(1)
3622
3623 # construct media url (with uid/token)
3624 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3625 mediaURL = mediaURL % (video_id, stream_token)
3626
3627 # description
3628 description = u'No description available'
3629 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3630 if mobj:
3631 description = mobj.group(1)
3632
3633 # upload date
3634 upload_date = None
3635 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3636 if mobj:
3637 try:
3638 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3639 except Exception, e:
3640 print str(e)
3641
3642 # for soundcloud, a request to a cross domain is required for cookies
3643 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3644
3645 try:
3646 self._downloader.process_info({
3647 'id': video_id.decode('utf-8'),
3648 'url': mediaURL,
3649 'uploader': uploader.decode('utf-8'),
3650 'upload_date': upload_date,
3651 'title': simple_title.decode('utf-8'),
3652 'stitle': simple_title.decode('utf-8'),
3653 'ext': u'mp3',
3654 'format': u'NA',
3655 'player_url': None,
3656 'description': description.decode('utf-8')
3657 })
3658 except UnavailableVideoError:
3659 self._downloader.trouble(u'\nERROR: unable to download video')
3660
3661
3662 class InfoQIE(InfoExtractor):
3663 """Information extractor for infoq.com"""
3664
3665 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3666 IE_NAME = u'infoq'
3667
3668 def report_webpage(self, video_id):
3669 """Report information extraction."""
3670 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3671
3672 def report_extraction(self, video_id):
3673 """Report information extraction."""
3674 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3675
3676 def _real_extract(self, url):
3677 htmlParser = HTMLParser.HTMLParser()
3678
3679 mobj = re.match(self._VALID_URL, url)
3680 if mobj is None:
3681 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3682 return
3683
3684 self.report_webpage(url)
3685
3686 request = urllib2.Request(url)
3687 try:
3688 webpage = urllib2.urlopen(request).read()
3689 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3690 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3691 return
3692
3693 self.report_extraction(url)
3694
3695
3696 # Extract video URL
3697 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3698 if mobj is None:
3699 self._downloader.trouble(u'ERROR: unable to extract video url')
3700 return
3701 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3702
3703
3704 # Extract title
3705 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3706 if mobj is None:
3707 self._downloader.trouble(u'ERROR: unable to extract video title')
3708 return
3709 video_title = mobj.group(1).decode('utf-8')
3710
3711 # Extract description
3712 video_description = u'No description available.'
3713 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3714 if mobj is not None:
3715 video_description = mobj.group(1).decode('utf-8')
3716
3717 video_filename = video_url.split('/')[-1]
3718 video_id, extension = video_filename.split('.')
3719
3720 self._downloader.increment_downloads()
3721 info = {
3722 'id': video_id,
3723 'url': video_url,
3724 'uploader': None,
3725 'upload_date': None,
3726 'title': video_title,
3727 'stitle': _simplify_title(video_title),
3728 'ext': extension,
3729 'format': extension, # Extension is always(?) mp4, but seems to be flv
3730 'thumbnail': None,
3731 'description': video_description,
3732 'player_url': None,
3733 }
3734
3735 try:
3736 self._downloader.process_info(info)
3737 except UnavailableVideoError, err:
3738 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3739
3740 class MixcloudIE(InfoExtractor):
3741 """Information extractor for www.mixcloud.com"""
3742 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3743 IE_NAME = u'mixcloud'
3744
3745 def __init__(self, downloader=None):
3746 InfoExtractor.__init__(self, downloader)
3747
3748 def report_download_json(self, file_id):
3749 """Report JSON download."""
3750 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3751
3752 def report_extraction(self, file_id):
3753 """Report information extraction."""
3754 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3755
3756 def get_urls(self, jsonData, fmt, bitrate='best'):
3757 """Get urls from 'audio_formats' section in json"""
3758 file_url = None
3759 try:
3760 bitrate_list = jsonData[fmt]
3761 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3762 bitrate = max(bitrate_list) # select highest
3763
3764 url_list = jsonData[fmt][bitrate]
3765 except TypeError: # we have no bitrate info.
3766 url_list = jsonData[fmt]
3767 return url_list
3768
3769 def check_urls(self, url_list):
3770 """Returns 1st active url from list"""
3771 for url in url_list:
3772 try:
3773 urllib2.urlopen(url)
3774 return url
3775 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3776 url = None
3777
3778 return None
3779
3780 def _print_formats(self, formats):
3781 print 'Available formats:'
3782 for fmt in formats.keys():
3783 for b in formats[fmt]:
3784 try:
3785 ext = formats[fmt][b][0]
3786 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3787 except TypeError: # we have no bitrate info
3788 ext = formats[fmt][0]
3789 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3790 break
3791
3792 def _real_extract(self, url):
3793 mobj = re.match(self._VALID_URL, url)
3794 if mobj is None:
3795 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3796 return
3797 # extract uploader & filename from url
3798 uploader = mobj.group(1).decode('utf-8')
3799 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3800
3801 # construct API request
3802 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3803 # retrieve .json file with links to files
3804 request = urllib2.Request(file_url)
3805 try:
3806 self.report_download_json(file_url)
3807 jsonData = urllib2.urlopen(request).read()
3808 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3809 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3810 return
3811
3812 # parse JSON
3813 json_data = json.loads(jsonData)
3814 player_url = json_data['player_swf_url']
3815 formats = dict(json_data['audio_formats'])
3816
3817 req_format = self._downloader.params.get('format', None)
3818 bitrate = None
3819
3820 if self._downloader.params.get('listformats', None):
3821 self._print_formats(formats)
3822 return
3823
3824 if req_format is None or req_format == 'best':
3825 for format_param in formats.keys():
3826 url_list = self.get_urls(formats, format_param)
3827 # check urls
3828 file_url = self.check_urls(url_list)
3829 if file_url is not None:
3830 break # got it!
3831 else:
3832 if req_format not in formats.keys():
3833 self._downloader.trouble(u'ERROR: format is not available')
3834 return
3835
3836 url_list = self.get_urls(formats, req_format)
3837 file_url = self.check_urls(url_list)
3838 format_param = req_format
3839
3840 # We have audio
3841 self._downloader.increment_downloads()
3842 try:
3843 # Process file information
3844 self._downloader.process_info({
3845 'id': file_id.decode('utf-8'),
3846 'url': file_url.decode('utf-8'),
3847 'uploader': uploader.decode('utf-8'),
3848 'upload_date': u'NA',
3849 'title': json_data['name'],
3850 'stitle': _simplify_title(json_data['name']),
3851 'ext': file_url.split('.')[-1].decode('utf-8'),
3852 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3853 'thumbnail': json_data['thumbnail_url'],
3854 'description': json_data['description'],
3855 'player_url': player_url.decode('utf-8'),
3856 })
3857 except UnavailableVideoError, err:
3858 self._downloader.trouble(u'ERROR: unable to download file')
3859
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three URL shapes: a specific VideoPage (course+video groups),
    # a CoursePage (course group only), and the site root / HomePage.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: single video, course playlist, or site root."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': _simplify_title(course + '_' + video),
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                # Missing <title> or <videoFile> element in the metadata.
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['stitle'] = _simplify_title(info['title'])
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.increment_downloads()
            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
        elif mobj.group('course'): # A course page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            course = mobj.group('course')
            info = {
                'id': _simplify_title(course),
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            # Course title is best-effort; fall back to the simplified id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each linked VideoPage is re-dispatched through self.extract(),
            # presumably landing back in this method's single-video branch.
            links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
        else: # Root page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            # Recurse into every course page linked from the home page.
            links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
3978
3979 class MTVIE(InfoExtractor):
3980 """Information extractor for MTV.com"""
3981
3982 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3983 IE_NAME = u'mtv'
3984
3985 def report_webpage(self, video_id):
3986 """Report information extraction."""
3987 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3988
3989 def report_extraction(self, video_id):
3990 """Report information extraction."""
3991 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3992
3993 def _real_extract(self, url):
3994 mobj = re.match(self._VALID_URL, url)
3995 if mobj is None:
3996 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3997 return
3998 if not mobj.group('proto'):
3999 url = 'http://' + url
4000 video_id = mobj.group('videoid')
4001 self.report_webpage(video_id)
4002
4003 request = urllib2.Request(url)
4004 try:
4005 webpage = urllib2.urlopen(request).read()
4006 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4007 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4008 return
4009
4010 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4011 if mobj is None:
4012 self._downloader.trouble(u'ERROR: unable to extract song name')
4013 return
4014 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4015 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4016 if mobj is None:
4017 self._downloader.trouble(u'ERROR: unable to extract performer')
4018 return
4019 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4020 video_title = performer + ' - ' + song_name
4021
4022 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4023 if mobj is None:
4024 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4025 return
4026 mtvn_uri = mobj.group(1)
4027
4028 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4029 if mobj is None:
4030 self._downloader.trouble(u'ERROR: unable to extract content id')
4031 return
4032 content_id = mobj.group(1)
4033
4034 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4035 self.report_extraction(video_id)
4036 request = urllib2.Request(videogen_url)
4037 try:
4038 metadataXml = urllib2.urlopen(request).read()
4039 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4040 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4041 return
4042
4043 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4044 renditions = mdoc.findall('.//rendition')
4045
4046 # For now, always pick the highest quality.
4047 rendition = renditions[-1]
4048
4049 try:
4050 _,_,ext = rendition.attrib['type'].partition('/')
4051 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4052 video_url = rendition.find('./src').text
4053 except KeyError:
4054 self._downloader.trouble('Invalid rendition field.')
4055 return
4056
4057 self._downloader.increment_downloads()
4058 info = {
4059 'id': video_id,
4060 'url': video_url,
4061 'uploader': performer,
4062 'title': video_title,
4063 'stitle': _simplify_title(video_title),
4064 'ext': ext,
4065 'format': format,
4066 }
4067
4068 try:
4069 self._downloader.process_info(info)
4070 except UnavailableVideoError, err:
4071 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4072
4073
class PostProcessor(object):
    """Base class for post-download processing steps.

    A PostProcessor is attached to a downloader through its
    add_post_processor() method. After every successful download the
    downloader calls run() on each registered processor in turn, feeding
    each one the dictionary returned by its predecessor (the first call
    receives the downloader's initial info dictionary).

    The chain stops as soon as a processor returns None, or once the last
    processor has run.

    Like InfoExtractor objects, PostProcessors take part in a mutual
    registration scheme with their downloader.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach *downloader* to this post processor."""
        self._downloader = downloader

    def run(self, information):
        """Run this post-processing step.

        *information* is an InfoExtractor-style dictionary carrying one
        extra key, "filepath", naming the downloaded file.

        Returning None halts the postprocessing chain; returning a
        dictionary (possibly with modified fields) passes it on to the
        next processor. Implementations may also raise
        PostProcessingError, which the calling downloader handles.
        """
        return information # by default, do nothing
4119
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe audio conversion fails.

    Subclasses Exception rather than BaseException (the original base)
    so generic 'except Exception' handlers can see it; the only catch
    site in this file uses a bare except plus isinstance, so the change
    is backward-compatible.
    """
    def __init__(self, message):
        Exception.__init__(self, message)  # makes str(err) meaningful
        self.message = message
4123
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that extracts the audio track of a downloaded video.

    Shells out to the external ffprobe/ffmpeg binaries. Depending on the
    source codec and the preferred codec, the audio stream is either
    copied losslessly into a new container or re-encoded (lossy).
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        # 'best' means: keep the source codec when possible, else mp3.
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec      # target codec name, or 'best'
        self._preferredquality = preferredquality  # value for ffmpeg '-ab', or None
        self._keepvideo = keepvideo                # keep the source video file after extraction

    @staticmethod
    def get_audio_codec(path):
        """Return the audio codec name of *path* via ffprobe, or None on any failure."""
        try:
            cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
            handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
            output = handle.communicate()[0]
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            # ffprobe missing or not executable.
            return None
        audio_codec = None
        # ffprobe emits key=value lines; remember the last codec_name seen
        # and report it when the matching codec_type=audio line appears
        # (assumes codec_name precedes codec_type within a stream block).
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to extract audio from *path* into *out_path*.

        *codec* of None lets ffmpeg pick the codec; otherwise it is passed
        via '-acodec'. Raises AudioConversionError when the ffmpeg binary
        is missing or exits non-zero.
        """
        if codec is None:
            acodec_opts = []
        else:
            acodec_opts = ['-acodec', codec]
        cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
        try:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout,stderr = p.communicate()
        except (IOError, OSError):
            e = sys.exc_info()[1]
            # errno 2 == ENOENT: the ffmpeg executable itself was not found.
            if isinstance(e, OSError) and e.errno == 2:
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
            else:
                raise e
        if p.returncode != 0:
            # Surface ffmpeg's last stderr line as the failure reason.
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        """Convert the downloaded file's audio and return the updated info dict.

        Returns None (halting the postprocessing chain) when the source
        codec cannot be determined or conversion fails.
        """
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        # First branch: the preferred codec matches (or is compatible with)
        # the source codec, so a lossless stream copy may be possible.
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                acodec = 'copy'
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    extension = 'ogg'
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = []
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = []
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
                extension = 'ogg'
            if self._preferredcodec == 'wav':
                extension = 'wav'
                more_opts += ['-f', 'wav']

        prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
        new_path = prefix + sep + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
        try:
            self.run_ffmpeg(path, new_path, acodec, more_opts)
        except:
            # Bare except by design: distinguish our own conversion errors
            # from unexpected failures, then stop the chain either way.
            etype,e,tb = sys.exc_info()
            if isinstance(e, AudioConversionError):
                self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
            else:
                self._downloader.to_stderr(u'ERROR: error running ffmpeg')
            return None

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            try:
                os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
            except:
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            try:
                os.remove(_encodeFilename(path))
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                return None

        information['filepath'] = new_path
        return information
4248
4249
4250 def updateSelf(downloader, filename):
4251 ''' Update the program file with the latest version from the repository '''
4252 # Note: downloader only used for options
4253 if not os.access(filename, os.W_OK):
4254 sys.exit('ERROR: no write permissions on %s' % filename)
4255
4256 downloader.to_screen(u'Updating to latest version...')
4257
4258 try:
4259 try:
4260 urlh = urllib.urlopen(UPDATE_URL)
4261 newcontent = urlh.read()
4262
4263 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4264 if vmatch is not None and vmatch.group(1) == __version__:
4265 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4266 return
4267 finally:
4268 urlh.close()
4269 except (IOError, OSError), err:
4270 sys.exit('ERROR: unable to download latest version')
4271
4272 try:
4273 outf = open(filename, 'wb')
4274 try:
4275 outf.write(newcontent)
4276 finally:
4277 outf.close()
4278 except (IOError, OSError), err:
4279 sys.exit('ERROR: unable to overwrite current version')
4280
4281 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4282
4283 def parseOpts():
4284 def _readOptions(filename_bytes):
4285 try:
4286 optionf = open(filename_bytes)
4287 except IOError:
4288 return [] # silently skip if file is not present
4289 try:
4290 res = []
4291 for l in optionf:
4292 res += shlex.split(l, comments=True)
4293 finally:
4294 optionf.close()
4295 return res
4296
4297 def _format_option_string(option):
4298 ''' ('-o', '--option') -> -o, --format METAVAR'''
4299
4300 opts = []
4301
4302 if option._short_opts: opts.append(option._short_opts[0])
4303 if option._long_opts: opts.append(option._long_opts[0])
4304 if len(opts) > 1: opts.insert(1, ', ')
4305
4306 if option.takes_value(): opts.append(' %s' % option.metavar)
4307
4308 return "".join(opts)
4309
4310 def _find_term_columns():
4311 columns = os.environ.get('COLUMNS', None)
4312 if columns:
4313 return int(columns)
4314
4315 try:
4316 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4317 out,err = sp.communicate()
4318 return int(out.split()[1])
4319 except:
4320 pass
4321 return None
4322
4323 max_width = 80
4324 max_help_position = 80
4325
4326 # No need to wrap help messages if we're on a wide console
4327 columns = _find_term_columns()
4328 if columns: max_width = columns
4329
4330 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4331 fmt.format_option_strings = _format_option_string
4332
4333 kw = {
4334 'version' : __version__,
4335 'formatter' : fmt,
4336 'usage' : '%prog [options] url [url...]',
4337 'conflict_handler' : 'resolve',
4338 }
4339
4340 parser = optparse.OptionParser(**kw)
4341
4342 # option groups
4343 general = optparse.OptionGroup(parser, 'General Options')
4344 selection = optparse.OptionGroup(parser, 'Video Selection')
4345 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4346 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4347 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4348 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4349 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4350
4351 general.add_option('-h', '--help',
4352 action='help', help='print this help text and exit')
4353 general.add_option('-v', '--version',
4354 action='version', help='print program version and exit')
4355 general.add_option('-U', '--update',
4356 action='store_true', dest='update_self', help='update this program to latest version')
4357 general.add_option('-i', '--ignore-errors',
4358 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4359 general.add_option('-r', '--rate-limit',
4360 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4361 general.add_option('-R', '--retries',
4362 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4363 general.add_option('--dump-user-agent',
4364 action='store_true', dest='dump_user_agent',
4365 help='display the current browser identification', default=False)
4366 general.add_option('--list-extractors',
4367 action='store_true', dest='list_extractors',
4368 help='List all supported extractors and the URLs they would handle', default=False)
4369
4370 selection.add_option('--playlist-start',
4371 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4372 selection.add_option('--playlist-end',
4373 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4374 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4375 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4376 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4377
4378 authentication.add_option('-u', '--username',
4379 dest='username', metavar='USERNAME', help='account username')
4380 authentication.add_option('-p', '--password',
4381 dest='password', metavar='PASSWORD', help='account password')
4382 authentication.add_option('-n', '--netrc',
4383 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4384
4385
4386 video_format.add_option('-f', '--format',
4387 action='store', dest='format', metavar='FORMAT', help='video format code')
4388 video_format.add_option('--all-formats',
4389 action='store_const', dest='format', help='download all available video formats', const='all')
4390 video_format.add_option('--prefer-free-formats',
4391 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4392 video_format.add_option('--max-quality',
4393 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4394 video_format.add_option('-F', '--list-formats',
4395 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4396 video_format.add_option('--write-srt',
4397 action='store_true', dest='writesubtitles',
4398 help='write video closed captions to a .srt file (currently youtube only)', default=False)
4399 video_format.add_option('--srt-lang',
4400 action='store', dest='subtitleslang', metavar='LANG',
4401 help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4402
4403
4404 verbosity.add_option('-q', '--quiet',
4405 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4406 verbosity.add_option('-s', '--simulate',
4407 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4408 verbosity.add_option('--skip-download',
4409 action='store_true', dest='skip_download', help='do not download the video', default=False)
4410 verbosity.add_option('-g', '--get-url',
4411 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4412 verbosity.add_option('-e', '--get-title',
4413 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4414 verbosity.add_option('--get-thumbnail',
4415 action='store_true', dest='getthumbnail',
4416 help='simulate, quiet but print thumbnail URL', default=False)
4417 verbosity.add_option('--get-description',
4418 action='store_true', dest='getdescription',
4419 help='simulate, quiet but print video description', default=False)
4420 verbosity.add_option('--get-filename',
4421 action='store_true', dest='getfilename',
4422 help='simulate, quiet but print output filename', default=False)
4423 verbosity.add_option('--get-format',
4424 action='store_true', dest='getformat',
4425 help='simulate, quiet but print output format', default=False)
4426 verbosity.add_option('--no-progress',
4427 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4428 verbosity.add_option('--console-title',
4429 action='store_true', dest='consoletitle',
4430 help='display progress in console titlebar', default=False)
4431 verbosity.add_option('-v', '--verbose',
4432 action='store_true', dest='verbose', help='print various debugging information', default=False)
4433
4434
4435 filesystem.add_option('-t', '--title',
4436 action='store_true', dest='usetitle', help='use title in file name', default=False)
4437 filesystem.add_option('-l', '--literal',
4438 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4439 filesystem.add_option('-A', '--auto-number',
4440 action='store_true', dest='autonumber',
4441 help='number downloaded files starting from 00000', default=False)
4442 filesystem.add_option('-o', '--output',
4443 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4444 filesystem.add_option('-a', '--batch-file',
4445 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4446 filesystem.add_option('-w', '--no-overwrites',
4447 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4448 filesystem.add_option('-c', '--continue',
4449 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4450 filesystem.add_option('--no-continue',
4451 action='store_false', dest='continue_dl',
4452 help='do not resume partially downloaded files (restart from beginning)')
4453 filesystem.add_option('--cookies',
4454 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4455 filesystem.add_option('--no-part',
4456 action='store_true', dest='nopart', help='do not use .part files', default=False)
4457 filesystem.add_option('--no-mtime',
4458 action='store_false', dest='updatetime',
4459 help='do not use the Last-modified header to set the file modification time', default=True)
4460 filesystem.add_option('--write-description',
4461 action='store_true', dest='writedescription',
4462 help='write video description to a .description file', default=False)
4463 filesystem.add_option('--write-info-json',
4464 action='store_true', dest='writeinfojson',
4465 help='write video metadata to a .info.json file', default=False)
4466
4467
4468 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4469 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4470 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4471 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4472 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4473 help='ffmpeg audio bitrate specification, 128k by default')
4474 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4475 help='keeps the video file on disk after the post-processing; the video is erased by default')
4476
4477
4478 parser.add_option_group(general)
4479 parser.add_option_group(selection)
4480 parser.add_option_group(filesystem)
4481 parser.add_option_group(verbosity)
4482 parser.add_option_group(video_format)
4483 parser.add_option_group(authentication)
4484 parser.add_option_group(postproc)
4485
4486 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4487 if xdg_config_home:
4488 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4489 else:
4490 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4491 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4492 opts, args = parser.parse_args(argv)
4493
4494 return parser, opts, args
4495
def gen_extractors():
	"""Build and return the ordered list of information extractor instances.

	Order is significant: in _real_main each URL is handed to the first
	extractor whose suitable() accepts it, so the catch-all GenericIE must
	come last.
	"""
	# A few extractors are shared: the playlist/user/search variants wrap
	# the same YoutubeIE instance, and likewise for Google and Yahoo.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()
	extractors = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),
	]
	# The generic fallback goes last so every site-specific extractor gets
	# first refusal.
	extractors.append(GenericIE())
	return extractors
4532
def _real_main():
	"""Parse the command line, build a configured FileDownloader and run it.

	Every path terminates the process via sys.exit() or parser.error():
	informational options (--dump-user-agent, --list-extractors) exit
	immediately, option validation errors abort, and otherwise the
	downloader's return code becomes the exit status.
	"""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar: in-memory by default, or a Mozilla-format
	# file jar when --cookies was given.
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load if the file already exists and is readable; a
			# missing file is not an error (it is written on save below).
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent (--dump-user-agent) and quit.
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification: read URLs from --batch-file ('-' = stdin),
	# dropping blank lines and lines starting with '#', '/' or ';'.
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# General configuration: install a process-wide urllib2 opener that
	# handles cookies, environment proxies and the custom YoutubeDLHandler.
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	if opts.verbose:
		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	# --list-extractors: print each extractor name followed by the given
	# URLs it would handle; matched URLs are removed from all_urls so each
	# URL is listed under at most one extractor.
	if opts.list_extractors:
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u'  ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# Prompt interactively so the password need not appear on the
		# command line.
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		# -1 is the sentinel for "until the end of the playlist".
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')

	# File downloader. Note that any --get-* option implies quiet mode and
	# skipping the actual download (print the requested field only).
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# First truthy template wins: an explicit -o template, then the
		# defaults derived from --format/--title/--literal/--auto-number.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# Writing the video to stdout means progress must go to stderr.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'writesubtitles': opts.writesubtitles,
		'subtitleslang': opts.subtitleslang,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version (--update overwrites the script at sys.argv[0])
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing: no URLs is an error unless we only came to update.
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()

	try:
		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		fd.to_screen(u'--max-download limit reached, aborting.')
		retcode = 101

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4706
def main():
	"""Entry point: run _real_main() and translate known failures into exit statuses."""
	try:
		_real_main()
	except DownloadError:
		# NOTE(review): presumably the error was already printed where it
		# was raised, so we only signal failure here — confirm upstream.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
4716
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
	main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: