# yt-dlp.git history / youtube-dl
# Commit: Extract original URL from next_url parameter of verify_age page, before actual extract
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Contributors, shown in project documentation and --help output.
__authors__  = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    'Kevin Ngo',
    'Ori Avtalion',
    'shizeeg',
    )

__license__ = 'Public Domain'
# Date-based version string (YYYY.MM.DD of the release).
__version__ = '2012.02.27'

# Canonical location of the newest released script; used by --update.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
25
26 import cookielib
27 import datetime
28 import getpass
29 import gzip
30 import htmlentitydefs
31 import HTMLParser
32 import httplib
33 import locale
34 import math
35 import netrc
36 import optparse
37 import os
38 import os.path
39 import re
40 import shlex
41 import socket
42 import string
43 import subprocess
44 import sys
45 import time
46 import urllib
47 import urllib2
48 import warnings
49 import zlib
50
51 if os.name == 'nt':
52 import ctypes
53
54 try:
55 import email.utils
56 except ImportError: # Python 2.4
57 import email.Utils
58 try:
59 import cStringIO as StringIO
60 except ImportError:
61 import StringIO
62
63 # parse_qs was moved from the cgi module to the urlparse module recently.
64 try:
65 from urlparse import parse_qs
66 except ImportError:
67 from cgi import parse_qs
68
69 try:
70 import lxml.etree
71 except ImportError:
72 pass # Handled below
73
74 try:
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
78
# Default headers added to every HTTP request; mimics a desktop Firefox
# so that sites serve the same pages they serve to a regular browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
86
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        """Minimal stand-in for the stdlib json module (loads() only).

        Implemented as a hand-written recursive-descent parser over the
        decoded unicode string; each parse* helper returns a tuple
        (next_index, parsed_value).
        """
        @staticmethod
        def loads(s):
            # Input is expected to be UTF-8 encoded bytes.
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past whitespace; with expectMore, running off the
                # end of the input is an error (a value must still follow).
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Translate one backslash escape (including \uXXXX and
                # UTF-16 surrogate pairs) into the character it denotes.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        # Surrogate pair: combine high and low halves
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                # Find the closing quote, skipping over escaped quotes
                # (a quote preceded by an odd number of backslashes).
                while True:
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # The three literal values: true, false, null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                # A fractional or exponent part makes it a float
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first significant character; numbers are the fallback.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
199
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The original wrapped this in an infinite generator and called
    # .next() on it once — a needless indirection (and Python-3-hostile).
    # A plain try/except returns the exact same value.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works before trusting it
        u'TEST'.encode(pref)
    except:
        pref = 'UTF-8'
    return pref
215
216
217 def htmlentity_transform(matchobj):
218 """Transforms an HTML entity to a Unicode character.
219
220 This function receives a match object and is intended to be used with
221 the re.sub() function.
222 """
223 entity = matchobj.group(1)
224
225 # Known non-numeric HTML entity
226 if entity in htmlentitydefs.name2codepoint:
227 return unichr(htmlentitydefs.name2codepoint[entity])
228
229 # Unicode character
230 mobj = re.match(ur'(?u)#(x?\d+)', entity)
231 if mobj is not None:
232 numstr = mobj.group(1)
233 if numstr.startswith(u'x'):
234 base = 16
235 numstr = u'0%s' % numstr
236 else:
237 base = 10
238 return unichr(long(numstr, base))
239
240 # Unknown entity in name, return its literal representation
241 return (u'&%s;' % entity)
242
243
244 def sanitize_title(utitle):
245 """Sanitizes a video title so it could be used as part of a filename."""
246 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
247 return utitle.replace(unicode(os.sep), u'%')
248
249
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means stdout; on Windows stdout must be switched to
            # binary mode or video data would be mangled by CRLF translation
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
275
276
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a Unix timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
284
285 def _simplify_title(title):
286 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287 return expr.sub(u'_', title).strip(u'_')
288
289 def _orderedSet(iterable):
290 """ Remove all duplicates from the input iterable """
291 res = []
292 for el in iterable:
293 if el not in res:
294 res.append(el)
295 return res
296
def _unescapeHTML(s):
    """Replace HTML entities in s by the characters they name.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')
    # A throwaway parser instance is enough; unescape() is stateless here
    return HTMLParser.HTMLParser().unescape(s)
305
306 def _encodeFilename(s):
307 """
308 @param s The name of the file (of type unicode)
309 """
310
311 assert type(s) == type(u'')
312
313 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
317 return s
318 else:
319 return s.encode(sys.getfilesystemencoding(), 'ignore')
320
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
329
330
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
338
339
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
347
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
351
352
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
360
361
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file turns out to
    be smaller than what the server announced first, which indicates the
    connection was probably interrupted.
    """
    # Byte counters (class-level defaults; set per instance below)
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
376
377
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Raw deflate streams lack the zlib header; try raw first, then
        # fall back to a full zlib stream.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # addinfourl only gained a 'code' argument in newer Pythons;
        # emulate it on older versions by setting the attribute manually.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force our standard headers, replacing any caller-supplied copies
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # The marker header opts this request out of compression
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
435
436
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    matchtitle:       Download only matching titles.
    rejecttitle:      Reject downloads for matching titles.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    writeinfojson:    Write the video description to a .info.json file
    writesubtitles:   Write the video subtitles to a .srt file
    subtitleslang:    Language of the subtitles to download
    """

    # Class-level defaults; real values are assigned per instance in __init__.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Route screen output to stderr if requested, keeping stdout clean
        # for the forced printings (--get-url, --get-title, ...)
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self.params = params
512
    @staticmethod
    def format_bytes(bytes):
        """Return a human-readable string (e.g. '1.21M') for a byte count."""
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            # Largest power of 1024 that keeps the converted value >= 1
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)
526
527 @staticmethod
528 def calc_percent(byte_counter, data_len):
529 if data_len is None:
530 return '---.-%'
531 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
532
533 @staticmethod
534 def calc_eta(start, now, total, current):
535 if total is None:
536 return '--:--'
537 dif = now - start
538 if current == 0 or dif < 0.001: # One millisecond
539 return '--:--'
540 rate = float(current) / dif
541 eta = long((float(total) - float(current)) / rate)
542 (eta_mins, eta_secs) = divmod(eta, 60)
543 if eta_mins > 99:
544 return '--:--'
545 return '%02d:%02d' % (eta_mins, eta_secs)
546
547 @staticmethod
548 def calc_speed(start, now, bytes):
549 dif = now - start
550 if bytes == 0 or dif < 0.001: # One millisecond
551 return '%10s' % '---b/s'
552 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
553
554 @staticmethod
555 def best_block_size(elapsed_time, bytes):
556 new_min = max(bytes / 2.0, 1.0)
557 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
558 if elapsed_time < 0.001:
559 return long(new_max)
560 rate = bytes / elapsed_time
561 if rate > new_max:
562 return long(new_max)
563 if rate < new_min:
564 return long(new_min)
565 return long(rate)
566
567 @staticmethod
568 def parse_bytes(bytestr):
569 """Parse a string indicating a byte quantity into a long integer."""
570 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
571 if matchobj is None:
572 return None
573 number = float(matchobj.group(1))
574 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
575 return long(round(number * multiplier))
576
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        # Mutual registration: the IE gets a reference back to its downloader
        ie.set_downloader(self)
581
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        # Mutual registration, same as for InfoExtractors
        pp.set_downloader(self)
586
    def to_screen(self, message, skip_eol=False):
        """Print message to stdout (or stderr, per logtostderr) unless quiet."""
        assert type(message) == type(u'')
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            output = message + terminator

            if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                output = output.encode(preferredencoding(), 'ignore')
            self._screen_file.write(output)
            # Flush immediately so progress updates appear in real time
            self._screen_file.flush()
598
    def to_stderr(self, message):
        """Print message to stderr."""
        # Encode explicitly: stderr may not accept unicode on this locale
        print >>sys.stderr, message.encode(preferredencoding())
602
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-compatible escape sequence: OSC 0 sets the window title
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
613
    def fixed_template(self):
        """Checks if the output template is fixed (contains no %(...)s fields)."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
617
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # With ignoreerrors set, just remember the failure for the exit code
        self._download_retcode = 1
630
    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough for the average speed since start_time
            # to fall back to the configured limit
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
643
644 def temp_name(self, filename):
645 """Returns a temporary filename for the given filename."""
646 if self.params.get('nopart', False) or filename == u'-' or \
647 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
648 return filename
649 return filename + u'.part'
650
651 def undo_temp_name(self, filename):
652 if filename.endswith(u'.part'):
653 return filename[:-len(u'.part')]
654 return filename
655
    def try_rename(self, old_filename, new_filename):
        """Rename the temporary file to its final name, reporting failures."""
        try:
            if old_filename == new_filename:
                return
            os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
663
664 def try_utime(self, filename, last_modified_hdr):
665 """Try to set the last-modified time of the given file."""
666 if last_modified_hdr is None:
667 return
668 if not os.path.isfile(_encodeFilename(filename)):
669 return
670 timestr = last_modified_hdr
671 if timestr is None:
672 return
673 filetime = timeconvert(timestr)
674 if filetime is None:
675 return filetime
676 try:
677 os.utime(filename, (time.time(), filetime))
678 except:
679 pass
680 return filetime
681
    def report_writedescription(self, descfn):
        """ Report that the .description file is being written """
        self.to_screen(u'[info] Writing video description to: ' + descfn)
685
    def report_writesubtitles(self, srtfn):
        """ Report that the .srt subtitles file is being written """
        self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
689
    def report_writeinfojson(self, infofn):
        """ Report that the .info.json metadata file is being written """
        self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
693
    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: ' + filename)
697
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress on one redrawn line (and console title)."""
        if self.params.get('noprogress', False):
            return
        # '\r' + skip_eol redraws the same line on each update
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
706
    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
710
    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
714
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a generic message when the name cannot be encoded
            self.to_screen(u'[download] The file has already been downloaded')
721
    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')
725
    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # The progress line is already on screen; just terminate it
            self.to_screen(u'')
732
    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        # Also feeds the %(autonumber)s output-template field
        self._num_downloads += 1
736
    def prepare_filename(self, info_dict):
        """Generate the output filename, or None on template errors."""
        try:
            template_dict = dict(info_dict)
            # Extra template fields not supplied by the InfoExtractor
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None
748
749 def _match_entry(self, info_dict):
750 """ Returns None iff the file should be downloaded """
751
752 title = info_dict['title']
753 matchtitle = self.params.get('matchtitle', False)
754 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
755 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
756 rejecttitle = self.params.get('rejecttitle', False)
757 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
758 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
759 return None
760
761 def process_info(self, info_dict):
762 """Process a single dictionary returned by an InfoExtractor."""
763
764 reason = self._match_entry(info_dict)
765 if reason is not None:
766 self.to_screen(u'[download] ' + reason)
767 return
768
769 max_downloads = self.params.get('max_downloads')
770 if max_downloads is not None:
771 if self._num_downloads > int(max_downloads):
772 raise MaxDownloadsReached()
773
774 filename = self.prepare_filename(info_dict)
775
776 # Forced printings
777 if self.params.get('forcetitle', False):
778 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
779 if self.params.get('forceurl', False):
780 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
781 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
782 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
783 if self.params.get('forcedescription', False) and 'description' in info_dict:
784 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
785 if self.params.get('forcefilename', False) and filename is not None:
786 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
787 if self.params.get('forceformat', False):
788 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
789
790 # Do nothing else if in simulate mode
791 if self.params.get('simulate', False):
792 return
793
794 if filename is None:
795 return
796
797 try:
798 dn = os.path.dirname(_encodeFilename(filename))
799 if dn != '' and not os.path.exists(dn): # dn is already encoded
800 os.makedirs(dn)
801 except (OSError, IOError), err:
802 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
803 return
804
805 if self.params.get('writedescription', False):
806 try:
807 descfn = filename + u'.description'
808 self.report_writedescription(descfn)
809 descfile = open(_encodeFilename(descfn), 'wb')
810 try:
811 descfile.write(info_dict['description'].encode('utf-8'))
812 finally:
813 descfile.close()
814 except (OSError, IOError):
815 self.trouble(u'ERROR: Cannot write description file ' + descfn)
816 return
817
818 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
819 # subtitles download errors are already managed as troubles in relevant IE
820 # that way it will silently go on when used with unsupporting IE
821 try:
822 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
823 self.report_writesubtitles(srtfn)
824 srtfile = open(_encodeFilename(srtfn), 'wb')
825 try:
826 srtfile.write(info_dict['subtitles'].encode('utf-8'))
827 finally:
828 srtfile.close()
829 except (OSError, IOError):
830 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
831 return
832
833 if self.params.get('writeinfojson', False):
834 infofn = filename + u'.info.json'
835 self.report_writeinfojson(infofn)
836 try:
837 json.dump
838 except (NameError,AttributeError):
839 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
840 return
841 try:
842 infof = open(_encodeFilename(infofn), 'wb')
843 try:
844 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
845 json.dump(json_info_dict, infof)
846 finally:
847 infof.close()
848 except (OSError, IOError):
849 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
850 return
851
852 if not self.params.get('skip_download', False):
853 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
854 success = True
855 else:
856 try:
857 success = self._do_download(filename, info_dict)
858 except (OSError, IOError), err:
859 raise UnavailableVideoError
860 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
861 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
862 return
863 except (ContentTooShortError, ), err:
864 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
865 return
866
867 if success:
868 try:
869 self.post_process(filename, info_dict)
870 except (PostProcessingError), err:
871 self.trouble(u'ERROR: postprocessing: %s' % str(err))
872 return
873
    def download(self, url_list):
        """Download a given list of URLs.

        Returns the accumulated return code (0 unless trouble() was
        triggered with ignoreerrors set).
        """
        # A fixed output template can only ever hold one download
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode
899
900 def post_process(self, filename, ie_info):
901 """Run the postprocessing chain on the given file."""
902 info = dict(ie_info)
903 info['filepath'] = filename
904 for pp in self._pps:
905 info = pp.run(info)
906 if info is None:
907 break
908
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by shelling out to the rtmpdump tool."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
        if self.params.get('verbose', False):
            try:
                import pipes
                shell_quote = lambda args: ' '.join(map(pipes.quote, args))
            except ImportError:
                shell_quote = repr
            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
        retval = subprocess.call(args)
        # Keep re-invoking rtmpdump in resume mode while it reports a
        # resumable interruption; stop when the file stops growing.
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(_encodeFilename(tmpfilename))
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(_encodeFilename(tmpfilename))
            if prevsize == cursize and retval == 1:
                break
            # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
953
954 def _do_download(self, filename, info_dict):
955 url = info_dict['url']
956 player_url = info_dict.get('player_url', None)
957
958 # Check file already present
959 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
960 self.report_file_already_downloaded(filename)
961 return True
962
963 # Attempt to download using rtmpdump
964 if url.startswith('rtmp'):
965 return self._download_with_rtmpdump(filename, url, player_url)
966
967 tmpfilename = self.temp_name(filename)
968 stream = None
969
970 # Do not include the Accept-Encoding header
971 headers = {'Youtubedl-no-compression': 'True'}
972 basic_request = urllib2.Request(url, None, headers)
973 request = urllib2.Request(url, None, headers)
974
975 # Establish possible resume length
976 if os.path.isfile(_encodeFilename(tmpfilename)):
977 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
978 else:
979 resume_len = 0
980
981 open_mode = 'wb'
982 if resume_len != 0:
983 if self.params.get('continuedl', False):
984 self.report_resuming_byte(resume_len)
985 request.add_header('Range','bytes=%d-' % resume_len)
986 open_mode = 'ab'
987 else:
988 resume_len = 0
989
990 count = 0
991 retries = self.params.get('retries', 0)
992 while count <= retries:
993 # Establish connection
994 try:
995 if count == 0 and 'urlhandle' in info_dict:
996 data = info_dict['urlhandle']
997 data = urllib2.urlopen(request)
998 break
999 except (urllib2.HTTPError, ), err:
1000 if (err.code < 500 or err.code >= 600) and err.code != 416:
1001 # Unexpected HTTP error
1002 raise
1003 elif err.code == 416:
1004 # Unable to resume (requested range not satisfiable)
1005 try:
1006 # Open the connection again without the range header
1007 data = urllib2.urlopen(basic_request)
1008 content_length = data.info()['Content-Length']
1009 except (urllib2.HTTPError, ), err:
1010 if err.code < 500 or err.code >= 600:
1011 raise
1012 else:
1013 # Examine the reported length
1014 if (content_length is not None and
1015 (resume_len - 100 < long(content_length) < resume_len + 100)):
1016 # The file had already been fully downloaded.
1017 # Explanation to the above condition: in issue #175 it was revealed that
1018 # YouTube sometimes adds or removes a few bytes from the end of the file,
1019 # changing the file size slightly and causing problems for some users. So
1020 # I decided to implement a suggested change and consider the file
1021 # completely downloaded if the file size differs less than 100 bytes from
1022 # the one in the hard drive.
1023 self.report_file_already_downloaded(filename)
1024 self.try_rename(tmpfilename, filename)
1025 return True
1026 else:
1027 # The length does not match, we start the download over
1028 self.report_unable_to_resume()
1029 open_mode = 'wb'
1030 break
1031 # Retry
1032 count += 1
1033 if count <= retries:
1034 self.report_retry(count, retries)
1035
1036 if count > retries:
1037 self.trouble(u'ERROR: giving up after %s retries' % retries)
1038 return False
1039
1040 data_len = data.info().get('Content-length', None)
1041 if data_len is not None:
1042 data_len = long(data_len) + resume_len
1043 data_len_str = self.format_bytes(data_len)
1044 byte_counter = 0 + resume_len
1045 block_size = 1024
1046 start = time.time()
1047 while True:
1048 # Download and write
1049 before = time.time()
1050 data_block = data.read(block_size)
1051 after = time.time()
1052 if len(data_block) == 0:
1053 break
1054 byte_counter += len(data_block)
1055
1056 # Open file just in time
1057 if stream is None:
1058 try:
1059 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1060 assert stream is not None
1061 filename = self.undo_temp_name(tmpfilename)
1062 self.report_destination(filename)
1063 except (OSError, IOError), err:
1064 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1065 return False
1066 try:
1067 stream.write(data_block)
1068 except (IOError, OSError), err:
1069 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1070 return False
1071 block_size = self.best_block_size(after - before, len(data_block))
1072
1073 # Progress message
1074 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1075 if data_len is None:
1076 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1077 else:
1078 percent_str = self.calc_percent(byte_counter, data_len)
1079 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1080 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1081
1082 # Apply rate limit
1083 self.slow_down(start, byte_counter - resume_len)
1084
1085 if stream is None:
1086 self.trouble(u'\nERROR: Did not get any data blocks')
1087 return False
1088 stream.close()
1089 self.report_finish()
1090 if data_len is not None and byte_counter != data_len:
1091 raise ContentTooShortError(byte_counter, long(data_len))
1092 self.try_rename(tmpfilename, filename)
1093
1094 # Update file modification time
1095 if self.params.get('updatetime', True):
1096 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1097
1098 return True
1099
1100
class InfoExtractor(object):
	"""Base class for all information extractors.

	Given a URL, an information extractor pulls out the data describing
	the video (or videos) that URL refers to: the real video URL, the
	literal and simplified titles, the uploader and so on. That data is
	collected in a dictionary which is then handed to the FileDownloader,
	which may download the video to the file system, among other possible
	outcomes. Every dictionary must carry these fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses are expected to redefine _real_initialize() and
	_real_extract() and to define a _VALID_URL regexp, and should
	probably be added to the list of extractors as well.
	"""

	# Whether _real_initialize() has already run for this instance
	_ready = False
	# The FileDownloader this extractor reports to (may be None)
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this IE can handle the given URL."""
		return bool(re.match(self._VALID_URL, url))

	def initialize(self):
		"""Initialize the instance once (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it in a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this IE should use."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1169
1170
1171 class YoutubeIE(InfoExtractor):
1172 """Information extractor for youtube.com."""
1173
1174 _PREFIX = r'(?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)'
1175 _VALID_URL = r'^('+_PREFIX+r'(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1176 _VALID_URL_WITH_AGE = r'^('+_PREFIX+')verify_age\?next_url=([^&]+)(?:.+)?$'
1177 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1178 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1179 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1180 _NETRC_MACHINE = 'youtube'
1181 # Listed in order of quality
1182 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1183 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1184 _video_extensions = {
1185 '13': '3gp',
1186 '17': 'mp4',
1187 '18': 'mp4',
1188 '22': 'mp4',
1189 '37': 'mp4',
1190 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1191 '43': 'webm',
1192 '44': 'webm',
1193 '45': 'webm',
1194 }
1195 _video_dimensions = {
1196 '5': '240x400',
1197 '6': '???',
1198 '13': '???',
1199 '17': '144x176',
1200 '18': '360x640',
1201 '22': '720x1280',
1202 '34': '360x640',
1203 '35': '480x854',
1204 '37': '1080x1920',
1205 '38': '3072x4096',
1206 '43': '360x640',
1207 '44': '480x854',
1208 '45': '720x1280',
1209 }
1210 IE_NAME = u'youtube'
1211
1212 def report_lang(self):
1213 """Report attempt to set language."""
1214 self._downloader.to_screen(u'[youtube] Setting language')
1215
1216 def report_login(self):
1217 """Report attempt to log in."""
1218 self._downloader.to_screen(u'[youtube] Logging in')
1219
1220 def report_age_confirmation(self):
1221 """Report attempt to confirm age."""
1222 self._downloader.to_screen(u'[youtube] Confirming age')
1223
1224 def report_video_webpage_download(self, video_id):
1225 """Report attempt to download video webpage."""
1226 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1227
1228 def report_video_info_webpage_download(self, video_id):
1229 """Report attempt to download video info webpage."""
1230 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1231
1232 def report_video_subtitles_download(self, video_id):
1233 """Report attempt to download video info webpage."""
1234 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1235
1236 def report_information_extraction(self, video_id):
1237 """Report attempt to extract video information."""
1238 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1239
1240 def report_unavailable_format(self, video_id, format):
1241 """Report extracted video URL."""
1242 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1243
1244 def report_rtmp_download(self):
1245 """Indicate the download will use the RTMP protocol."""
1246 self._downloader.to_screen(u'[youtube] RTMP download detected')
1247
1248 def _closed_captions_xml_to_srt(self, xml_string):
1249 srt = ''
1250 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1251 # TODO parse xml instead of regex
1252 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1253 if not dur: dur = '4'
1254 start = float(start)
1255 end = start + float(dur)
1256 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1257 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1258 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1259 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1260 srt += str(n) + '\n'
1261 srt += start + ' --> ' + end + '\n'
1262 srt += caption + '\n\n'
1263 return srt
1264
1265 def _print_formats(self, formats):
1266 print 'Available formats:'
1267 for x in formats:
1268 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1269
1270 def _real_initialize(self):
1271 if self._downloader is None:
1272 return
1273
1274 username = None
1275 password = None
1276 downloader_params = self._downloader.params
1277
1278 # Attempt to use provided username and password or .netrc data
1279 if downloader_params.get('username', None) is not None:
1280 username = downloader_params['username']
1281 password = downloader_params['password']
1282 elif downloader_params.get('usenetrc', False):
1283 try:
1284 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1285 if info is not None:
1286 username = info[0]
1287 password = info[2]
1288 else:
1289 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1290 except (IOError, netrc.NetrcParseError), err:
1291 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1292 return
1293
1294 # Set language
1295 request = urllib2.Request(self._LANG_URL)
1296 try:
1297 self.report_lang()
1298 urllib2.urlopen(request).read()
1299 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1300 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1301 return
1302
1303 # No authentication to be performed
1304 if username is None:
1305 return
1306
1307 # Log in
1308 login_form = {
1309 'current_form': 'loginForm',
1310 'next': '/',
1311 'action_login': 'Log In',
1312 'username': username,
1313 'password': password,
1314 }
1315 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1316 try:
1317 self.report_login()
1318 login_results = urllib2.urlopen(request).read()
1319 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1320 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1321 return
1322 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1323 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1324 return
1325
1326 # Confirm age
1327 age_form = {
1328 'next_url': '/',
1329 'action_confirm': 'Confirm',
1330 }
1331 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1332 try:
1333 self.report_age_confirmation()
1334 age_results = urllib2.urlopen(request).read()
1335 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1336 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1337 return
1338
1339 def _real_extract(self, url):
1340 # Extract original video URL from URL with age verification, using next_url parameter
1341 mobj = re.match(self._VALID_URL_WITH_AGE, url)
1342 if mobj:
1343 urldecode = lambda x: re.sub(r'%([0-9a-hA-H][0-9a-hA-H])', lambda m: chr(int(m.group(1), 16)), x)
1344 # Keep original domain. We can probably change to www.youtube.com, but it should not hurt so keep it.
1345 # We just make sure we do not have double //, in URL, so we strip starting slash in next_url.
1346 url = mobj.group(1) + re.sub(r'^/', '', urldecode(mobj.group(2)))
1347
1348 # Extract video id from URL
1349 mobj = re.match(self._VALID_URL, url)
1350 if mobj is None:
1351 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1352 return
1353 video_id = mobj.group(2)
1354
1355 # Get video webpage
1356 self.report_video_webpage_download(video_id)
1357 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1358 try:
1359 video_webpage = urllib2.urlopen(request).read()
1360 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1361 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1362 return
1363
1364 # Attempt to extract SWF player URL
1365 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1366 if mobj is not None:
1367 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1368 else:
1369 player_url = None
1370
1371 # Get video info
1372 self.report_video_info_webpage_download(video_id)
1373 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1374 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1375 % (video_id, el_type))
1376 request = urllib2.Request(video_info_url)
1377 try:
1378 video_info_webpage = urllib2.urlopen(request).read()
1379 video_info = parse_qs(video_info_webpage)
1380 if 'token' in video_info:
1381 break
1382 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1383 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1384 return
1385 if 'token' not in video_info:
1386 if 'reason' in video_info:
1387 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1388 else:
1389 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1390 return
1391
1392 # Start extracting information
1393 self.report_information_extraction(video_id)
1394
1395 # uploader
1396 if 'author' not in video_info:
1397 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1398 return
1399 video_uploader = urllib.unquote_plus(video_info['author'][0])
1400
1401 # title
1402 if 'title' not in video_info:
1403 self._downloader.trouble(u'ERROR: unable to extract video title')
1404 return
1405 video_title = urllib.unquote_plus(video_info['title'][0])
1406 video_title = video_title.decode('utf-8')
1407 video_title = sanitize_title(video_title)
1408
1409 # simplified title
1410 simple_title = _simplify_title(video_title)
1411
1412 # thumbnail image
1413 if 'thumbnail_url' not in video_info:
1414 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1415 video_thumbnail = ''
1416 else: # don't panic if we can't find it
1417 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1418
1419 # upload date
1420 upload_date = u'NA'
1421 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1422 if mobj is not None:
1423 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1424 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1425 for expression in format_expressions:
1426 try:
1427 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1428 except:
1429 pass
1430
1431 # description
1432 try:
1433 lxml.etree
1434 except NameError:
1435 video_description = u'No description available.'
1436 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1437 if mobj is not None:
1438 video_description = mobj.group(1).decode('utf-8')
1439 else:
1440 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1441 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1442 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1443 # TODO use another parser
1444
1445 # closed captions
1446 video_subtitles = None
1447 if self._downloader.params.get('writesubtitles', False):
1448 self.report_video_subtitles_download(video_id)
1449 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1450 try:
1451 srt_list = urllib2.urlopen(request).read()
1452 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1453 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1454 else:
1455 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1456 if srt_lang_list:
1457 if self._downloader.params.get('subtitleslang', False):
1458 srt_lang = self._downloader.params.get('subtitleslang')
1459 elif 'en' in srt_lang_list:
1460 srt_lang = 'en'
1461 else:
1462 srt_lang = srt_lang_list[0]
1463 if not srt_lang in srt_lang_list:
1464 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1465 else:
1466 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1467 try:
1468 srt_xml = urllib2.urlopen(request).read()
1469 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1470 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1471 else:
1472 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1473 else:
1474 self._downloader.trouble(u'WARNING: video has no closed captions')
1475
1476 # token
1477 video_token = urllib.unquote_plus(video_info['token'][0])
1478
1479 # Decide which formats to download
1480 req_format = self._downloader.params.get('format', None)
1481
1482 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1483 self.report_rtmp_download()
1484 video_url_list = [(None, video_info['conn'][0])]
1485 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1486 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1487 url_data = [parse_qs(uds) for uds in url_data_strs]
1488 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1489 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1490
1491 format_limit = self._downloader.params.get('format_limit', None)
1492 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1493 if format_limit is not None and format_limit in available_formats:
1494 format_list = available_formats[available_formats.index(format_limit):]
1495 else:
1496 format_list = available_formats
1497 existing_formats = [x for x in format_list if x in url_map]
1498 if len(existing_formats) == 0:
1499 self._downloader.trouble(u'ERROR: no known formats available for video')
1500 return
1501 if self._downloader.params.get('listformats', None):
1502 self._print_formats(existing_formats)
1503 return
1504 if req_format is None or req_format == 'best':
1505 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1506 elif req_format == 'worst':
1507 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1508 elif req_format in ('-1', 'all'):
1509 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1510 else:
1511 # Specific formats. We pick the first in a slash-delimeted sequence.
1512 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1513 req_formats = req_format.split('/')
1514 video_url_list = None
1515 for rf in req_formats:
1516 if rf in url_map:
1517 video_url_list = [(rf, url_map[rf])]
1518 break
1519 if video_url_list is None:
1520 self._downloader.trouble(u'ERROR: requested format not available')
1521 return
1522 else:
1523 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1524 return
1525
1526 for format_param, video_real_url in video_url_list:
1527 # At this point we have a new video
1528 self._downloader.increment_downloads()
1529
1530 # Extension
1531 video_extension = self._video_extensions.get(format_param, 'flv')
1532
1533 try:
1534 # Process video information
1535 self._downloader.process_info({
1536 'id': video_id.decode('utf-8'),
1537 'url': video_real_url.decode('utf-8'),
1538 'uploader': video_uploader.decode('utf-8'),
1539 'upload_date': upload_date,
1540 'title': video_title,
1541 'stitle': simple_title,
1542 'ext': video_extension.decode('utf-8'),
1543 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1544 'thumbnail': video_thumbnail.decode('utf-8'),
1545 'description': video_description,
1546 'player_url': player_url,
1547 'subtitles': video_subtitles
1548 })
1549 except UnavailableVideoError, err:
1550 self._downloader.trouble(u'\nERROR: unable to download video')
1551
1552
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE instance used for videos that metacafe merely embeds from YouTube
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Receives the YoutubeIE to delegate yt-* ids to."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and disable the family filter."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract and download the video behind a metacafe.com watch URL."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Delegate the whole extraction to the YouTube IE
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			# Older page layout: media URL (plus optional gdaKey) in the page query string
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Newer layout: media URL and key live inside the flashvars blob
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# The JSON-ish blob escapes slashes; undo that
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1693
1694
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract and download the video behind a Dailymotion video URL."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Disabling the family filter exposes age-restricted videos
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		# The "sequence" flashvar contains the SD stream URL
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1781
1782
1783 class GoogleIE(InfoExtractor):
1784 """Information extractor for video.google.com."""
1785
1786 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1787 IE_NAME = u'video.google'
1788
1789 def __init__(self, downloader=None):
1790 InfoExtractor.__init__(self, downloader)
1791
1792 def report_download_webpage(self, video_id):
1793 """Report webpage download."""
1794 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1795
1796 def report_extraction(self, video_id):
1797 """Report information extraction."""
1798 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1799
1800 def _real_extract(self, url):
1801 # Extract id from URL
1802 mobj = re.match(self._VALID_URL, url)
1803 if mobj is None:
1804 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1805 return
1806
1807 # At this point we have a new video
1808 self._downloader.increment_downloads()
1809 video_id = mobj.group(1)
1810
1811 video_extension = 'mp4'
1812
1813 # Retrieve video webpage to extract further information
1814 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1815 try:
1816 self.report_download_webpage(video_id)
1817 webpage = urllib2.urlopen(request).read()
1818 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1819 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1820 return
1821
1822 # Extract URL, uploader, and title from webpage
1823 self.report_extraction(video_id)
1824 mobj = re.search(r"download_url:'([^']+)'", webpage)
1825 if mobj is None:
1826 video_extension = 'flv'
1827 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1828 if mobj is None:
1829 self._downloader.trouble(u'ERROR: unable to extract media URL')
1830 return
1831 mediaURL = urllib.unquote(mobj.group(1))
1832 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1833 mediaURL = mediaURL.replace('\\x26', '\x26')
1834
1835 video_url = mediaURL
1836
1837 mobj = re.search(r'<title>(.*)</title>', webpage)
1838 if mobj is None:
1839 self._downloader.trouble(u'ERROR: unable to extract title')
1840 return
1841 video_title = mobj.group(1).decode('utf-8')
1842 video_title = sanitize_title(video_title)
1843 simple_title = _simplify_title(video_title)
1844
1845 # Extract video description
1846 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1847 if mobj is None:
1848 self._downloader.trouble(u'ERROR: unable to extract video description')
1849 return
1850 video_description = mobj.group(1).decode('utf-8')
1851 if not video_description:
1852 video_description = 'No description available.'
1853
1854 # Extract video thumbnail
1855 if self._downloader.params.get('forcethumbnail', False):
1856 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1857 try:
1858 webpage = urllib2.urlopen(request).read()
1859 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1860 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1861 return
1862 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1863 if mobj is None:
1864 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1865 return
1866 video_thumbnail = mobj.group(1)
1867 else: # we need something to pass to process_info
1868 video_thumbnail = ''
1869
1870 try:
1871 # Process video information
1872 self._downloader.process_info({
1873 'id': video_id.decode('utf-8'),
1874 'url': video_url.decode('utf-8'),
1875 'uploader': u'NA',
1876 'upload_date': u'NA',
1877 'title': video_title,
1878 'stitle': simple_title,
1879 'ext': video_extension.decode('utf-8'),
1880 'format': u'NA',
1881 'player_url': None,
1882 })
1883 except UnavailableVideoError:
1884 self._downloader.trouble(u'\nERROR: unable to download video')
1885
1886
1887 class PhotobucketIE(InfoExtractor):
1888 """Information extractor for photobucket.com."""
1889
1890 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1891 IE_NAME = u'photobucket'
1892
1893 def __init__(self, downloader=None):
1894 InfoExtractor.__init__(self, downloader)
1895
1896 def report_download_webpage(self, video_id):
1897 """Report webpage download."""
1898 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1899
1900 def report_extraction(self, video_id):
1901 """Report information extraction."""
1902 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1903
1904 def _real_extract(self, url):
1905 # Extract id from URL
1906 mobj = re.match(self._VALID_URL, url)
1907 if mobj is None:
1908 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1909 return
1910
1911 # At this point we have a new video
1912 self._downloader.increment_downloads()
1913 video_id = mobj.group(1)
1914
1915 video_extension = 'flv'
1916
1917 # Retrieve video webpage to extract further information
1918 request = urllib2.Request(url)
1919 try:
1920 self.report_download_webpage(video_id)
1921 webpage = urllib2.urlopen(request).read()
1922 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1923 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1924 return
1925
1926 # Extract URL, uploader, and title from webpage
1927 self.report_extraction(video_id)
1928 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1929 if mobj is None:
1930 self._downloader.trouble(u'ERROR: unable to extract media URL')
1931 return
1932 mediaURL = urllib.unquote(mobj.group(1))
1933
1934 video_url = mediaURL
1935
1936 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1937 if mobj is None:
1938 self._downloader.trouble(u'ERROR: unable to extract title')
1939 return
1940 video_title = mobj.group(1).decode('utf-8')
1941 video_title = sanitize_title(video_title)
1942 simple_title = _simplify_title(vide_title)
1943
1944 video_uploader = mobj.group(2).decode('utf-8')
1945
1946 try:
1947 # Process video information
1948 self._downloader.process_info({
1949 'id': video_id.decode('utf-8'),
1950 'url': video_url.decode('utf-8'),
1951 'uploader': video_uploader,
1952 'upload_date': u'NA',
1953 'title': video_title,
1954 'stitle': simple_title,
1955 'ext': video_extension.decode('utf-8'),
1956 'format': u'NA',
1957 'player_url': None,
1958 })
1959 except UnavailableVideoError:
1960 self._downloader.trouble(u'\nERROR: unable to download video')
1961
1962
1963 class YahooIE(InfoExtractor):
1964 """Information extractor for video.yahoo.com."""
1965
1966 # _VALID_URL matches all Yahoo! Video URLs
1967 # _VPAGE_URL matches only the extractable '/watch/' URLs
1968 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1969 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1970 IE_NAME = u'video.yahoo'
1971
1972 def __init__(self, downloader=None):
1973 InfoExtractor.__init__(self, downloader)
1974
1975 def report_download_webpage(self, video_id):
1976 """Report webpage download."""
1977 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1978
1979 def report_extraction(self, video_id):
1980 """Report information extraction."""
1981 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1982
1983 def _real_extract(self, url, new_video=True):
1984 # Extract ID from URL
1985 mobj = re.match(self._VALID_URL, url)
1986 if mobj is None:
1987 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1988 return
1989
1990 # At this point we have a new video
1991 self._downloader.increment_downloads()
1992 video_id = mobj.group(2)
1993 video_extension = 'flv'
1994
1995 # Rewrite valid but non-extractable URLs as
1996 # extractable English language /watch/ URLs
1997 if re.match(self._VPAGE_URL, url) is None:
1998 request = urllib2.Request(url)
1999 try:
2000 webpage = urllib2.urlopen(request).read()
2001 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2002 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2003 return
2004
2005 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2006 if mobj is None:
2007 self._downloader.trouble(u'ERROR: Unable to extract id field')
2008 return
2009 yahoo_id = mobj.group(1)
2010
2011 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2012 if mobj is None:
2013 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2014 return
2015 yahoo_vid = mobj.group(1)
2016
2017 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2018 return self._real_extract(url, new_video=False)
2019
2020 # Retrieve video webpage to extract further information
2021 request = urllib2.Request(url)
2022 try:
2023 self.report_download_webpage(video_id)
2024 webpage = urllib2.urlopen(request).read()
2025 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2026 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2027 return
2028
2029 # Extract uploader and title from webpage
2030 self.report_extraction(video_id)
2031 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2032 if mobj is None:
2033 self._downloader.trouble(u'ERROR: unable to extract video title')
2034 return
2035 video_title = mobj.group(1).decode('utf-8')
2036 simple_title = _simplify_title(video_title)
2037
2038 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2039 if mobj is None:
2040 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2041 return
2042 video_uploader = mobj.group(1).decode('utf-8')
2043
2044 # Extract video thumbnail
2045 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2046 if mobj is None:
2047 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2048 return
2049 video_thumbnail = mobj.group(1).decode('utf-8')
2050
2051 # Extract video description
2052 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2053 if mobj is None:
2054 self._downloader.trouble(u'ERROR: unable to extract video description')
2055 return
2056 video_description = mobj.group(1).decode('utf-8')
2057 if not video_description:
2058 video_description = 'No description available.'
2059
2060 # Extract video height and width
2061 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2062 if mobj is None:
2063 self._downloader.trouble(u'ERROR: unable to extract video height')
2064 return
2065 yv_video_height = mobj.group(1)
2066
2067 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2068 if mobj is None:
2069 self._downloader.trouble(u'ERROR: unable to extract video width')
2070 return
2071 yv_video_width = mobj.group(1)
2072
2073 # Retrieve video playlist to extract media URL
2074 # I'm not completely sure what all these options are, but we
2075 # seem to need most of them, otherwise the server sends a 401.
2076 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2077 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2078 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2079 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2080 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2081 try:
2082 self.report_download_webpage(video_id)
2083 webpage = urllib2.urlopen(request).read()
2084 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2085 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2086 return
2087
2088 # Extract media URL from playlist XML
2089 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2090 if mobj is None:
2091 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2092 return
2093 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2094 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2095
2096 try:
2097 # Process video information
2098 self._downloader.process_info({
2099 'id': video_id.decode('utf-8'),
2100 'url': video_url,
2101 'uploader': video_uploader,
2102 'upload_date': u'NA',
2103 'title': video_title,
2104 'stitle': simple_title,
2105 'ext': video_extension.decode('utf-8'),
2106 'thumbnail': video_thumbnail.decode('utf-8'),
2107 'description': video_description,
2108 'thumbnail': video_thumbnail,
2109 'player_url': None,
2110 })
2111 except UnavailableVideoError:
2112 self._downloader.trouble(u'\nERROR: unable to download video')
2113
2114
class VimeoIE(InfoExtractor):
	"""Information extractor for vimeo.com."""

	# _VALID_URL matches Vimeo URLs
	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
	IE_NAME = u'vimeo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

	def _real_extract(self, url, new_video=True):
		"""Extract metadata from the page's embedded config JSON and build a
		play_redirect URL for the best known codec.

		NOTE(review): the new_video parameter is accepted but never read in
		this body -- presumably kept for signature parity with YahooIE.
		"""
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Now we begin extracting as much information as we can from what we
		# retrieved. First we extract the information common to all extractors,
		# and latter we extract those that are Vimeo specific.
		self.report_extraction(video_id)

		# Extract the config JSON by slicing between the literal page markers
		# ' = {config:' and ',assets:' -- brittle, but avoids a full JS parse.
		config = webpage.split(' = {config:')[1].split(',assets:')[0]
		try:
			config = json.loads(config)
		# NOTE(review): bare except -- swallows any parse failure (and more);
		# narrowing to ValueError would be safer, left as-is here.
		except:
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		# Extract title
		video_title = config["video"]["title"]
		simple_title = _simplify_title(video_title)

		# Extract uploader
		video_uploader = config["video"]["owner"]["name"]

		# Extract video thumbnail
		video_thumbnail = config["video"]["thumbnail"]

		# Extract video description. lxml is an optional dependency (its
		# import at the top of the file is wrapped in try/except), so probe
		# for the name and fall back to a regex when it is absent.
		try:
			lxml.etree
		except NameError:
			video_description = u'No description available.'
			mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
			if mobj is not None:
				video_description = mobj.group(1)
		else:
			html_parser = lxml.etree.HTMLParser()
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
			# TODO use another parser

		# Extract upload date (optional; stays u'NA' when the span is absent)
		video_upload_date = u'NA'
		mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
		if mobj is not None:
			video_upload_date = mobj.group(1)

		# Vimeo specific: extract request signature and timestamp
		sig = config['request']['signature']
		timestamp = config['request']['timestamp']

		# Vimeo specific: extract video codec and quality information.
		# Codecs are tried in preference order; the for/else reports failure
		# only when none of them appears in the config.
		# TODO bind to format param
		codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
		for codec in codecs:
			if codec[0] in config["video"]["files"]:
				video_codec = codec[0]
				video_extension = codec[1]
				# 'hd' wins whenever the chosen codec offers it.
				if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
				else: quality = 'sd'
				break
		else:
			self._downloader.trouble(u'ERROR: no known codec found')
			return

		video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
			%(video_id, sig, timestamp, quality, video_codec.upper())

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id,
				'url':		video_url,
				'uploader':	video_uploader,
				'upload_date':	video_upload_date,
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension,
				'thumbnail':	video_thumbnail,
				'description':	video_description,
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
2233
2234
2235 class GenericIE(InfoExtractor):
2236 """Generic last-resort information extractor."""
2237
2238 _VALID_URL = r'.*'
2239 IE_NAME = u'generic'
2240
2241 def __init__(self, downloader=None):
2242 InfoExtractor.__init__(self, downloader)
2243
2244 def report_download_webpage(self, video_id):
2245 """Report webpage download."""
2246 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2247 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2248
2249 def report_extraction(self, video_id):
2250 """Report information extraction."""
2251 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2252
2253 def _real_extract(self, url):
2254 # At this point we have a new video
2255 self._downloader.increment_downloads()
2256
2257 video_id = url.split('/')[-1]
2258 request = urllib2.Request(url)
2259 try:
2260 self.report_download_webpage(video_id)
2261 webpage = urllib2.urlopen(request).read()
2262 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2263 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2264 return
2265 except ValueError, err:
2266 # since this is the last-resort InfoExtractor, if
2267 # this error is thrown, it'll be thrown here
2268 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2269 return
2270
2271 self.report_extraction(video_id)
2272 # Start with something easy: JW Player in SWFObject
2273 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2274 if mobj is None:
2275 # Broaden the search a little bit
2276 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2277 if mobj is None:
2278 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2279 return
2280
2281 # It's possible that one of the regexes
2282 # matched, but returned an empty group:
2283 if mobj.group(1) is None:
2284 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2285 return
2286
2287 video_url = urllib.unquote(mobj.group(1))
2288 video_id = os.path.basename(video_url)
2289
2290 # here's a fun little line of code for you:
2291 video_extension = os.path.splitext(video_id)[1][1:]
2292 video_id = os.path.splitext(video_id)[0]
2293
2294 # it's tempting to parse this further, but you would
2295 # have to take into account all the variations like
2296 # Video Title - Site Name
2297 # Site Name | Video Title
2298 # Video Title - Tagline | Site Name
2299 # and so on and so forth; it's just not practical
2300 mobj = re.search(r'<title>(.*)</title>', webpage)
2301 if mobj is None:
2302 self._downloader.trouble(u'ERROR: unable to extract title')
2303 return
2304 video_title = mobj.group(1).decode('utf-8')
2305 video_title = sanitize_title(video_title)
2306 simple_title = _simplify_title(video_title)
2307
2308 # video uploader is domain name
2309 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2310 if mobj is None:
2311 self._downloader.trouble(u'ERROR: unable to extract title')
2312 return
2313 video_uploader = mobj.group(1).decode('utf-8')
2314
2315 try:
2316 # Process video information
2317 self._downloader.process_info({
2318 'id': video_id.decode('utf-8'),
2319 'url': video_url.decode('utf-8'),
2320 'uploader': video_uploader,
2321 'upload_date': u'NA',
2322 'title': video_title,
2323 'stitle': simple_title,
2324 'ext': video_extension.decode('utf-8'),
2325 'format': u'NA',
2326 'player_url': None,
2327 })
2328 except UnavailableVideoError, err:
2329 self._downloader.trouble(u'\nERROR: unable to download video')
2330
2331
2332 class YoutubeSearchIE(InfoExtractor):
2333 """Information Extractor for YouTube search queries."""
2334 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2335 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2336 _youtube_ie = None
2337 _max_youtube_results = 1000
2338 IE_NAME = u'youtube:search'
2339
2340 def __init__(self, youtube_ie, downloader=None):
2341 InfoExtractor.__init__(self, downloader)
2342 self._youtube_ie = youtube_ie
2343
2344 def report_download_page(self, query, pagenum):
2345 """Report attempt to download playlist page with given number."""
2346 query = query.decode(preferredencoding())
2347 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2348
2349 def _real_initialize(self):
2350 self._youtube_ie.initialize()
2351
2352 def _real_extract(self, query):
2353 mobj = re.match(self._VALID_URL, query)
2354 if mobj is None:
2355 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2356 return
2357
2358 prefix, query = query.split(':')
2359 prefix = prefix[8:]
2360 query = query.encode('utf-8')
2361 if prefix == '':
2362 self._download_n_results(query, 1)
2363 return
2364 elif prefix == 'all':
2365 self._download_n_results(query, self._max_youtube_results)
2366 return
2367 else:
2368 try:
2369 n = long(prefix)
2370 if n <= 0:
2371 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2372 return
2373 elif n > self._max_youtube_results:
2374 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2375 n = self._max_youtube_results
2376 self._download_n_results(query, n)
2377 return
2378 except ValueError: # parsing prefix as integer fails
2379 self._download_n_results(query, 1)
2380 return
2381
2382 def _download_n_results(self, query, n):
2383 """Downloads a specified number of results for a query"""
2384
2385 video_ids = []
2386 pagenum = 0
2387 limit = n
2388
2389 while (50 * pagenum) < limit:
2390 self.report_download_page(query, pagenum+1)
2391 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2392 request = urllib2.Request(result_url)
2393 try:
2394 data = urllib2.urlopen(request).read()
2395 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2396 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2397 return
2398 api_response = json.loads(data)['data']
2399
2400 new_ids = list(video['id'] for video in api_response['items'])
2401 video_ids += new_ids
2402
2403 limit = min(n, api_response['totalItems'])
2404 pagenum += 1
2405
2406 if len(video_ids) > n:
2407 video_ids = video_ids[:n]
2408 for id in video_ids:
2409 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2410 return
2411
2412
2413 class GoogleSearchIE(InfoExtractor):
2414 """Information Extractor for Google Video search queries."""
2415 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2416 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2417 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2418 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2419 _google_ie = None
2420 _max_google_results = 1000
2421 IE_NAME = u'video.google:search'
2422
2423 def __init__(self, google_ie, downloader=None):
2424 InfoExtractor.__init__(self, downloader)
2425 self._google_ie = google_ie
2426
2427 def report_download_page(self, query, pagenum):
2428 """Report attempt to download playlist page with given number."""
2429 query = query.decode(preferredencoding())
2430 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2431
2432 def _real_initialize(self):
2433 self._google_ie.initialize()
2434
2435 def _real_extract(self, query):
2436 mobj = re.match(self._VALID_URL, query)
2437 if mobj is None:
2438 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2439 return
2440
2441 prefix, query = query.split(':')
2442 prefix = prefix[8:]
2443 query = query.encode('utf-8')
2444 if prefix == '':
2445 self._download_n_results(query, 1)
2446 return
2447 elif prefix == 'all':
2448 self._download_n_results(query, self._max_google_results)
2449 return
2450 else:
2451 try:
2452 n = long(prefix)
2453 if n <= 0:
2454 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2455 return
2456 elif n > self._max_google_results:
2457 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2458 n = self._max_google_results
2459 self._download_n_results(query, n)
2460 return
2461 except ValueError: # parsing prefix as integer fails
2462 self._download_n_results(query, 1)
2463 return
2464
2465 def _download_n_results(self, query, n):
2466 """Downloads a specified number of results for a query"""
2467
2468 video_ids = []
2469 pagenum = 0
2470
2471 while True:
2472 self.report_download_page(query, pagenum)
2473 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2474 request = urllib2.Request(result_url)
2475 try:
2476 page = urllib2.urlopen(request).read()
2477 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2478 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2479 return
2480
2481 # Extract video identifiers
2482 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2483 video_id = mobj.group(1)
2484 if video_id not in video_ids:
2485 video_ids.append(video_id)
2486 if len(video_ids) == n:
2487 # Specified n videos reached
2488 for id in video_ids:
2489 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2490 return
2491
2492 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2493 for id in video_ids:
2494 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2495 return
2496
2497 pagenum = pagenum + 1
2498
2499
2500 class YahooSearchIE(InfoExtractor):
2501 """Information Extractor for Yahoo! Video search queries."""
2502 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2503 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2504 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2505 _MORE_PAGES_INDICATOR = r'\s*Next'
2506 _yahoo_ie = None
2507 _max_yahoo_results = 1000
2508 IE_NAME = u'video.yahoo:search'
2509
2510 def __init__(self, yahoo_ie, downloader=None):
2511 InfoExtractor.__init__(self, downloader)
2512 self._yahoo_ie = yahoo_ie
2513
2514 def report_download_page(self, query, pagenum):
2515 """Report attempt to download playlist page with given number."""
2516 query = query.decode(preferredencoding())
2517 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2518
2519 def _real_initialize(self):
2520 self._yahoo_ie.initialize()
2521
2522 def _real_extract(self, query):
2523 mobj = re.match(self._VALID_URL, query)
2524 if mobj is None:
2525 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2526 return
2527
2528 prefix, query = query.split(':')
2529 prefix = prefix[8:]
2530 query = query.encode('utf-8')
2531 if prefix == '':
2532 self._download_n_results(query, 1)
2533 return
2534 elif prefix == 'all':
2535 self._download_n_results(query, self._max_yahoo_results)
2536 return
2537 else:
2538 try:
2539 n = long(prefix)
2540 if n <= 0:
2541 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2542 return
2543 elif n > self._max_yahoo_results:
2544 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2545 n = self._max_yahoo_results
2546 self._download_n_results(query, n)
2547 return
2548 except ValueError: # parsing prefix as integer fails
2549 self._download_n_results(query, 1)
2550 return
2551
2552 def _download_n_results(self, query, n):
2553 """Downloads a specified number of results for a query"""
2554
2555 video_ids = []
2556 already_seen = set()
2557 pagenum = 1
2558
2559 while True:
2560 self.report_download_page(query, pagenum)
2561 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2562 request = urllib2.Request(result_url)
2563 try:
2564 page = urllib2.urlopen(request).read()
2565 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2566 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2567 return
2568
2569 # Extract video identifiers
2570 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2571 video_id = mobj.group(1)
2572 if video_id not in already_seen:
2573 video_ids.append(video_id)
2574 already_seen.add(video_id)
2575 if len(video_ids) == n:
2576 # Specified n videos reached
2577 for id in video_ids:
2578 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2579 return
2580
2581 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2582 for id in video_ids:
2583 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2584 return
2585
2586 pagenum = pagenum + 1
2587
2588
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists."""

	# Matches playlist, artist, course, user and '#p/c/' style URLs; group 1
	# is the list-type query key (p|a|list), group 2 the playlist id, group 3
	# an optional single-video id.
	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	# Filled as (access_page, query_key, playlist_id, page_number)
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	_youtube_ie = None
	IE_NAME = u'youtube:playlist'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		"""Collect all video ids of a playlist (following pagination) and
		delegate each one to the YouTube extractor."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# Single video case
		if mobj.group(3) is not None:
			self._youtube_ie.extract(mobj.group(3))
			return

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
		else:
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)
		video_ids = []
		pagenum = 1

		while True:
			self.report_download_page(playlist_id, pagenum)
			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
			request = urllib2.Request(url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduped within a page; cross-page
			# duplicates are kept, matching the playlist's own listing)
			ids_in_page = []
			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			# Stop when the page has no 'Next' link
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum = pagenum + 1

		# Apply --playlist-start/--playlist-end. NOTE(review): playlistend is
		# used as an absolute index here, not relative to playliststart --
		# looks intentional for this version; confirm against option docs.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
		return
2665
2666
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50  # maximum results per GData query
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    _youtube_ie = None  # delegate extractor that downloads each found video
    IE_NAME = u'youtube:user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        # Delegate initialization (e.g. login) to the YouTube extractor.
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Enumerate a user's uploads via the GData feed, then delegate each video."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData's start-index parameter is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (deduplicated within the page).
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end (1-based, inclusive).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        # Delegate each collected video to the regular YouTube extractor.
        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2753
2754
2755 class DepositFilesIE(InfoExtractor):
2756 """Information extractor for depositfiles.com"""
2757
2758 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2759 IE_NAME = u'DepositFiles'
2760
2761 def __init__(self, downloader=None):
2762 InfoExtractor.__init__(self, downloader)
2763
2764 def report_download_webpage(self, file_id):
2765 """Report webpage download."""
2766 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2767
2768 def report_extraction(self, file_id):
2769 """Report information extraction."""
2770 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2771
2772 def _real_extract(self, url):
2773 # At this point we have a new file
2774 self._downloader.increment_downloads()
2775
2776 file_id = url.split('/')[-1]
2777 # Rebuild url in english locale
2778 url = 'http://depositfiles.com/en/files/' + file_id
2779
2780 # Retrieve file webpage with 'Free download' button pressed
2781 free_download_indication = { 'gateway_result' : '1' }
2782 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2783 try:
2784 self.report_download_webpage(file_id)
2785 webpage = urllib2.urlopen(request).read()
2786 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2787 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2788 return
2789
2790 # Search for the real file URL
2791 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2792 if (mobj is None) or (mobj.group(1) is None):
2793 # Try to figure out reason of the error.
2794 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2795 if (mobj is not None) and (mobj.group(1) is not None):
2796 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2797 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2798 else:
2799 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2800 return
2801
2802 file_url = mobj.group(1)
2803 file_extension = os.path.splitext(file_url)[1][1:]
2804
2805 # Search for file title
2806 mobj = re.search(r'<b title="(.*?)">', webpage)
2807 if mobj is None:
2808 self._downloader.trouble(u'ERROR: unable to extract title')
2809 return
2810 file_title = mobj.group(1).decode('utf-8')
2811
2812 try:
2813 # Process file information
2814 self._downloader.process_info({
2815 'id': file_id.decode('utf-8'),
2816 'url': file_url.decode('utf-8'),
2817 'uploader': u'NA',
2818 'upload_date': u'NA',
2819 'title': file_title,
2820 'stitle': file_title,
2821 'ext': file_extension.decode('utf-8'),
2822 'format': u'NA',
2823 'player_url': None,
2824 })
2825 except UnavailableVideoError, err:
2826 self._downloader.trouble(u'ERROR: unable to download file')
2827
2828
2829 class FacebookIE(InfoExtractor):
2830 """Information Extractor for Facebook"""
2831
2832 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2833 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2834 _NETRC_MACHINE = 'facebook'
2835 _available_formats = ['video', 'highqual', 'lowqual']
2836 _video_extensions = {
2837 'video': 'mp4',
2838 'highqual': 'mp4',
2839 'lowqual': 'mp4',
2840 }
2841 IE_NAME = u'facebook'
2842
2843 def __init__(self, downloader=None):
2844 InfoExtractor.__init__(self, downloader)
2845
2846 def _reporter(self, message):
2847 """Add header and report message."""
2848 self._downloader.to_screen(u'[facebook] %s' % message)
2849
2850 def report_login(self):
2851 """Report attempt to log in."""
2852 self._reporter(u'Logging in')
2853
2854 def report_video_webpage_download(self, video_id):
2855 """Report attempt to download video webpage."""
2856 self._reporter(u'%s: Downloading video webpage' % video_id)
2857
2858 def report_information_extraction(self, video_id):
2859 """Report attempt to extract video information."""
2860 self._reporter(u'%s: Extracting video information' % video_id)
2861
2862 def _parse_page(self, video_webpage):
2863 """Extract video information from page"""
2864 # General data
2865 data = {'title': r'\("video_title", "(.*?)"\)',
2866 'description': r'<div class="datawrap">(.*?)</div>',
2867 'owner': r'\("video_owner_name", "(.*?)"\)',
2868 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2869 }
2870 video_info = {}
2871 for piece in data.keys():
2872 mobj = re.search(data[piece], video_webpage)
2873 if mobj is not None:
2874 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2875
2876 # Video urls
2877 video_urls = {}
2878 for fmt in self._available_formats:
2879 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2880 if mobj is not None:
2881 # URL is in a Javascript segment inside an escaped Unicode format within
2882 # the generally utf-8 page
2883 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2884 video_info['video_urls'] = video_urls
2885
2886 return video_info
2887
2888 def _real_initialize(self):
2889 if self._downloader is None:
2890 return
2891
2892 useremail = None
2893 password = None
2894 downloader_params = self._downloader.params
2895
2896 # Attempt to use provided username and password or .netrc data
2897 if downloader_params.get('username', None) is not None:
2898 useremail = downloader_params['username']
2899 password = downloader_params['password']
2900 elif downloader_params.get('usenetrc', False):
2901 try:
2902 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2903 if info is not None:
2904 useremail = info[0]
2905 password = info[2]
2906 else:
2907 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2908 except (IOError, netrc.NetrcParseError), err:
2909 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2910 return
2911
2912 if useremail is None:
2913 return
2914
2915 # Log in
2916 login_form = {
2917 'email': useremail,
2918 'pass': password,
2919 'login': 'Log+In'
2920 }
2921 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2922 try:
2923 self.report_login()
2924 login_results = urllib2.urlopen(request).read()
2925 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2926 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2927 return
2928 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2929 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2930 return
2931
2932 def _real_extract(self, url):
2933 mobj = re.match(self._VALID_URL, url)
2934 if mobj is None:
2935 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2936 return
2937 video_id = mobj.group('ID')
2938
2939 # Get video webpage
2940 self.report_video_webpage_download(video_id)
2941 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2942 try:
2943 page = urllib2.urlopen(request)
2944 video_webpage = page.read()
2945 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2946 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2947 return
2948
2949 # Start extracting information
2950 self.report_information_extraction(video_id)
2951
2952 # Extract information
2953 video_info = self._parse_page(video_webpage)
2954
2955 # uploader
2956 if 'owner' not in video_info:
2957 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2958 return
2959 video_uploader = video_info['owner']
2960
2961 # title
2962 if 'title' not in video_info:
2963 self._downloader.trouble(u'ERROR: unable to extract video title')
2964 return
2965 video_title = video_info['title']
2966 video_title = video_title.decode('utf-8')
2967 video_title = sanitize_title(video_title)
2968
2969 simple_title = _simplify_title(video_title)
2970
2971 # thumbnail image
2972 if 'thumbnail' not in video_info:
2973 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2974 video_thumbnail = ''
2975 else:
2976 video_thumbnail = video_info['thumbnail']
2977
2978 # upload date
2979 upload_date = u'NA'
2980 if 'upload_date' in video_info:
2981 upload_time = video_info['upload_date']
2982 timetuple = email.utils.parsedate_tz(upload_time)
2983 if timetuple is not None:
2984 try:
2985 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2986 except:
2987 pass
2988
2989 # description
2990 video_description = video_info.get('description', 'No description available.')
2991
2992 url_map = video_info['video_urls']
2993 if len(url_map.keys()) > 0:
2994 # Decide which formats to download
2995 req_format = self._downloader.params.get('format', None)
2996 format_limit = self._downloader.params.get('format_limit', None)
2997
2998 if format_limit is not None and format_limit in self._available_formats:
2999 format_list = self._available_formats[self._available_formats.index(format_limit):]
3000 else:
3001 format_list = self._available_formats
3002 existing_formats = [x for x in format_list if x in url_map]
3003 if len(existing_formats) == 0:
3004 self._downloader.trouble(u'ERROR: no known formats available for video')
3005 return
3006 if req_format is None:
3007 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3008 elif req_format == 'worst':
3009 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3010 elif req_format == '-1':
3011 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3012 else:
3013 # Specific format
3014 if req_format not in url_map:
3015 self._downloader.trouble(u'ERROR: requested format not available')
3016 return
3017 video_url_list = [(req_format, url_map[req_format])] # Specific format
3018
3019 for format_param, video_real_url in video_url_list:
3020
3021 # At this point we have a new video
3022 self._downloader.increment_downloads()
3023
3024 # Extension
3025 video_extension = self._video_extensions.get(format_param, 'mp4')
3026
3027 try:
3028 # Process video information
3029 self._downloader.process_info({
3030 'id': video_id.decode('utf-8'),
3031 'url': video_real_url.decode('utf-8'),
3032 'uploader': video_uploader.decode('utf-8'),
3033 'upload_date': upload_date,
3034 'title': video_title,
3035 'stitle': simple_title,
3036 'ext': video_extension.decode('utf-8'),
3037 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3038 'thumbnail': video_thumbnail.decode('utf-8'),
3039 'description': video_description.decode('utf-8'),
3040 'player_url': None,
3041 })
3042 except UnavailableVideoError, err:
3043 self._downloader.trouble(u'\nERROR: unable to download video')
3044
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'  # filename extension at the end of a media URL
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch blip.tv JSON metadata for the URL (or detect a direct media link)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask blip.tv for JSON metadata by appending skin=json to the URL.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            # If the server answers with a video/* Content-Type, the URL is
            # the media file itself and no JSON step is needed.
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'stitle': _simplify_title(title),
                    'ext': ext,
                    'urlhandle': urlh  # reuse the already-open handle for download
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload may be wrapped in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # 'datestamp' uses the '%m-%d-%y %H:%M%p' format; normalize to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the filename extension from the media URL.
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'stitle': _simplify_title(data['title']),
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        self._downloader.increment_downloads()

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
3137
3138
3139 class MyVideoIE(InfoExtractor):
3140 """Information Extractor for myvideo.de."""
3141
3142 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3143 IE_NAME = u'myvideo'
3144
3145 def __init__(self, downloader=None):
3146 InfoExtractor.__init__(self, downloader)
3147
3148 def report_download_webpage(self, video_id):
3149 """Report webpage download."""
3150 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3151
3152 def report_extraction(self, video_id):
3153 """Report information extraction."""
3154 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3155
3156 def _real_extract(self,url):
3157 mobj = re.match(self._VALID_URL, url)
3158 if mobj is None:
3159 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3160 return
3161
3162 video_id = mobj.group(1)
3163
3164 # Get video webpage
3165 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3166 try:
3167 self.report_download_webpage(video_id)
3168 webpage = urllib2.urlopen(request).read()
3169 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3170 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3171 return
3172
3173 self.report_extraction(video_id)
3174 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3175 webpage)
3176 if mobj is None:
3177 self._downloader.trouble(u'ERROR: unable to extract media URL')
3178 return
3179 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3180
3181 mobj = re.search('<title>([^<]+)</title>', webpage)
3182 if mobj is None:
3183 self._downloader.trouble(u'ERROR: unable to extract title')
3184 return
3185
3186 video_title = mobj.group(1)
3187 video_title = sanitize_title(video_title)
3188
3189 simple_title = _simplify_title(video_title)
3190
3191 try:
3192 self._downloader.process_info({
3193 'id': video_id,
3194 'url': video_url,
3195 'uploader': u'NA',
3196 'upload_date': u'NA',
3197 'title': video_title,
3198 'stitle': simple_title,
3199 'ext': u'flv',
3200 'format': u'NA',
3201 'player_url': None,
3202 })
3203 except UnavailableVideoError:
3204 self._downloader.trouble(u'\nERROR: Unable to download video')
3205
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts ':tds'/':colbert' style shortcuts as well as full episode URLs.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of a media-segment configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the Flash player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve the episode page, fetch its MRSS index, and download each segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Map ':tds' style shortcuts to the show's full-episodes page and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # An empty episode group means "download the newest episode".
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # The site redirects the full-episodes page to the newest episode;
            # re-parse the final URL to recover the episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the embedded Flash player URL(s): (full player URL, media uri).
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the final player URL.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        # Each <item> in the MRSS index is one media segment of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            # Collect (bitrate, url) pairs for every available rendition.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'stitle': _simplify_title(effTitle),
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
                continue
3340
3341
3342 class EscapistIE(InfoExtractor):
3343 """Information extractor for The Escapist """
3344
3345 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3346 IE_NAME = u'escapist'
3347
3348 def report_extraction(self, showName):
3349 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3350
3351 def report_config_download(self, showName):
3352 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3353
3354 def _real_extract(self, url):
3355 htmlParser = HTMLParser.HTMLParser()
3356
3357 mobj = re.match(self._VALID_URL, url)
3358 if mobj is None:
3359 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3360 return
3361 showName = mobj.group('showname')
3362 videoId = mobj.group('episode')
3363
3364 self.report_extraction(showName)
3365 try:
3366 webPage = urllib2.urlopen(url).read()
3367 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3368 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3369 return
3370
3371 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3372 description = htmlParser.unescape(descMatch.group(1))
3373 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3374 imgUrl = htmlParser.unescape(imgMatch.group(1))
3375 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3376 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3377 configUrlMatch = re.search('config=(.*)$', playerUrl)
3378 configUrl = urllib2.unquote(configUrlMatch.group(1))
3379
3380 self.report_config_download(showName)
3381 try:
3382 configJSON = urllib2.urlopen(configUrl).read()
3383 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3384 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3385 return
3386
3387 # Technically, it's JavaScript, not JSON
3388 configJSON = configJSON.replace("'", '"')
3389
3390 try:
3391 config = json.loads(configJSON)
3392 except (ValueError,), err:
3393 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3394 return
3395
3396 playlist = config['playlist']
3397 videoUrl = playlist[1]['url']
3398
3399 self._downloader.increment_downloads()
3400 info = {
3401 'id': videoId,
3402 'url': videoUrl,
3403 'uploader': showName,
3404 'upload_date': None,
3405 'title': showName,
3406 'stitle': _simplify_title(showName),
3407 'ext': 'flv',
3408 'format': 'flv',
3409 'thumbnail': imgUrl,
3410 'description': description,
3411 'player_url': playerUrl,
3412 }
3413
3414 try:
3415 self._downloader.process_info(info)
3416 except UnavailableVideoError, err:
3417 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3418
3419
3420 class CollegeHumorIE(InfoExtractor):
3421 """Information extractor for collegehumor.com"""
3422
3423 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3424 IE_NAME = u'collegehumor'
3425
3426 def report_webpage(self, video_id):
3427 """Report information extraction."""
3428 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3429
3430 def report_extraction(self, video_id):
3431 """Report information extraction."""
3432 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3433
3434 def _real_extract(self, url):
3435 htmlParser = HTMLParser.HTMLParser()
3436
3437 mobj = re.match(self._VALID_URL, url)
3438 if mobj is None:
3439 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3440 return
3441 video_id = mobj.group('videoid')
3442
3443 self.report_webpage(video_id)
3444 request = urllib2.Request(url)
3445 try:
3446 webpage = urllib2.urlopen(request).read()
3447 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3448 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3449 return
3450
3451 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3452 if m is None:
3453 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3454 return
3455 internal_video_id = m.group('internalvideoid')
3456
3457 info = {
3458 'id': video_id,
3459 'internal_id': internal_video_id,
3460 }
3461
3462 self.report_extraction(video_id)
3463 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3464 try:
3465 metaXml = urllib2.urlopen(xmlUrl).read()
3466 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3467 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3468 return
3469
3470 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3471 try:
3472 videoNode = mdoc.findall('./video')[0]
3473 info['description'] = videoNode.findall('./description')[0].text
3474 info['title'] = videoNode.findall('./caption')[0].text
3475 info['stitle'] = _simplify_title(info['title'])
3476 info['url'] = videoNode.findall('./file')[0].text
3477 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3478 info['ext'] = info['url'].rpartition('.')[2]
3479 info['format'] = info['ext']
3480 except IndexError:
3481 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3482 return
3483
3484 self._downloader.increment_downloads()
3485
3486 try:
3487 self._downloader.process_info(info)
3488 except UnavailableVideoError, err:
3489 self._downloader.trouble(u'\nERROR: unable to download video')
3490
3491
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the flv URL, title, and thumbnail from an xvideos page."""
        # NOTE(review): htmlParser is created but never used in this method.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        self.report_extraction(video_id)

        # Extract video URL
        # The media URL is percent-encoded in a 'flv_url' query parameter.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Extract title (the part of <title> before the site suffix)
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        self._downloader.increment_downloads()
        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'stitle': _simplify_title(video_title),
            'ext': 'flv',
            'format': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
            'player_url': None,
        }

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3571
3572
3573 class SoundcloudIE(InfoExtractor):
3574 """Information extractor for soundcloud.com
3575 To access the media, the uid of the song and a stream token
3576 must be extracted from the page source and the script must make
3577 a request to media.soundcloud.com/crossdomain.xml. Then
3578 the media can be grabbed by requesting from an url composed
3579 of the stream token and uid
3580 """
3581
3582 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3583 IE_NAME = u'soundcloud'
3584
3585 def __init__(self, downloader=None):
3586 InfoExtractor.__init__(self, downloader)
3587
3588 def report_webpage(self, video_id):
3589 """Report information extraction."""
3590 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3591
3592 def report_extraction(self, video_id):
3593 """Report information extraction."""
3594 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3595
3596 def _real_extract(self, url):
3597 htmlParser = HTMLParser.HTMLParser()
3598
3599 mobj = re.match(self._VALID_URL, url)
3600 if mobj is None:
3601 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3602 return
3603
3604 # extract uploader (which is in the url)
3605 uploader = mobj.group(1).decode('utf-8')
3606 # extract simple title (uploader + slug of song title)
3607 slug_title = mobj.group(2).decode('utf-8')
3608 simple_title = uploader + '-' + slug_title
3609
3610 self.report_webpage('%s/%s' % (uploader, slug_title))
3611
3612 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3613 try:
3614 webpage = urllib2.urlopen(request).read()
3615 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3616 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3617 return
3618
3619 self.report_extraction('%s/%s' % (uploader, slug_title))
3620
3621 # extract uid and stream token that soundcloud hands out for access
3622 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3623 if mobj:
3624 video_id = mobj.group(1)
3625 stream_token = mobj.group(2)
3626
3627 # extract unsimplified title
3628 mobj = re.search('"title":"(.*?)",', webpage)
3629 if mobj:
3630 title = mobj.group(1)
3631
3632 # construct media url (with uid/token)
3633 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3634 mediaURL = mediaURL % (video_id, stream_token)
3635
3636 # description
3637 description = u'No description available'
3638 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3639 if mobj:
3640 description = mobj.group(1)
3641
3642 # upload date
3643 upload_date = None
3644 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3645 if mobj:
3646 try:
3647 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3648 except Exception, e:
3649 print str(e)
3650
3651 # for soundcloud, a request to a cross domain is required for cookies
3652 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3653
3654 try:
3655 self._downloader.process_info({
3656 'id': video_id.decode('utf-8'),
3657 'url': mediaURL,
3658 'uploader': uploader.decode('utf-8'),
3659 'upload_date': upload_date,
3660 'title': simple_title.decode('utf-8'),
3661 'stitle': simple_title.decode('utf-8'),
3662 'ext': u'mp3',
3663 'format': u'NA',
3664 'player_url': None,
3665 'description': description.decode('utf-8')
3666 })
3667 except UnavailableVideoError:
3668 self._downloader.trouble(u'\nERROR: unable to download video')
3669
3670
3671 class InfoQIE(InfoExtractor):
3672 """Information extractor for infoq.com"""
3673
3674 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3675 IE_NAME = u'infoq'
3676
3677 def report_webpage(self, video_id):
3678 """Report information extraction."""
3679 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3680
3681 def report_extraction(self, video_id):
3682 """Report information extraction."""
3683 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3684
3685 def _real_extract(self, url):
3686 htmlParser = HTMLParser.HTMLParser()
3687
3688 mobj = re.match(self._VALID_URL, url)
3689 if mobj is None:
3690 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3691 return
3692
3693 self.report_webpage(url)
3694
3695 request = urllib2.Request(url)
3696 try:
3697 webpage = urllib2.urlopen(request).read()
3698 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3699 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3700 return
3701
3702 self.report_extraction(url)
3703
3704
3705 # Extract video URL
3706 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3707 if mobj is None:
3708 self._downloader.trouble(u'ERROR: unable to extract video url')
3709 return
3710 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3711
3712
3713 # Extract title
3714 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3715 if mobj is None:
3716 self._downloader.trouble(u'ERROR: unable to extract video title')
3717 return
3718 video_title = mobj.group(1).decode('utf-8')
3719
3720 # Extract description
3721 video_description = u'No description available.'
3722 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3723 if mobj is not None:
3724 video_description = mobj.group(1).decode('utf-8')
3725
3726 video_filename = video_url.split('/')[-1]
3727 video_id, extension = video_filename.split('.')
3728
3729 self._downloader.increment_downloads()
3730 info = {
3731 'id': video_id,
3732 'url': video_url,
3733 'uploader': None,
3734 'upload_date': None,
3735 'title': video_title,
3736 'stitle': _simplify_title(video_title),
3737 'ext': extension,
3738 'format': extension, # Extension is always(?) mp4, but seems to be flv
3739 'thumbnail': None,
3740 'description': video_description,
3741 'player_url': None,
3742 }
3743
3744 try:
3745 self._downloader.process_info(info)
3746 except UnavailableVideoError, err:
3747 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3748
3749 class MixcloudIE(InfoExtractor):
3750 """Information extractor for www.mixcloud.com"""
3751 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3752 IE_NAME = u'mixcloud'
3753
3754 def __init__(self, downloader=None):
3755 InfoExtractor.__init__(self, downloader)
3756
3757 def report_download_json(self, file_id):
3758 """Report JSON download."""
3759 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3760
3761 def report_extraction(self, file_id):
3762 """Report information extraction."""
3763 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3764
3765 def get_urls(self, jsonData, fmt, bitrate='best'):
3766 """Get urls from 'audio_formats' section in json"""
3767 file_url = None
3768 try:
3769 bitrate_list = jsonData[fmt]
3770 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3771 bitrate = max(bitrate_list) # select highest
3772
3773 url_list = jsonData[fmt][bitrate]
3774 except TypeError: # we have no bitrate info.
3775 url_list = jsonData[fmt]
3776
3777 return url_list
3778
3779 def check_urls(self, url_list):
3780 """Returns 1st active url from list"""
3781 for url in url_list:
3782 try:
3783 urllib2.urlopen(url)
3784 return url
3785 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3786 url = None
3787
3788 return None
3789
3790 def _print_formats(self, formats):
3791 print 'Available formats:'
3792 for fmt in formats.keys():
3793 for b in formats[fmt]:
3794 try:
3795 ext = formats[fmt][b][0]
3796 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3797 except TypeError: # we have no bitrate info
3798 ext = formats[fmt][0]
3799 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3800 break
3801
3802 def _real_extract(self, url):
3803 mobj = re.match(self._VALID_URL, url)
3804 if mobj is None:
3805 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3806 return
3807 # extract uploader & filename from url
3808 uploader = mobj.group(1).decode('utf-8')
3809 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3810
3811 # construct API request
3812 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3813 # retrieve .json file with links to files
3814 request = urllib2.Request(file_url)
3815 try:
3816 self.report_download_json(file_url)
3817 jsonData = urllib2.urlopen(request).read()
3818 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3819 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3820 return
3821
3822 # parse JSON
3823 json_data = json.loads(jsonData)
3824 player_url = json_data['player_swf_url']
3825 formats = dict(json_data['audio_formats'])
3826
3827 req_format = self._downloader.params.get('format', None)
3828 bitrate = None
3829
3830 if self._downloader.params.get('listformats', None):
3831 self._print_formats(formats)
3832 return
3833
3834 if req_format is None or req_format == 'best':
3835 for format_param in formats.keys():
3836 url_list = self.get_urls(formats, format_param)
3837 # check urls
3838 file_url = self.check_urls(url_list)
3839 if file_url is not None:
3840 break # got it!
3841 else:
3842 if req_format not in formats.keys():
3843 self._downloader.trouble(u'ERROR: format is not available')
3844 return
3845
3846 url_list = self.get_urls(formats, req_format)
3847 file_url = self.check_urls(url_list)
3848 format_param = req_format
3849
3850 # We have audio
3851 self._downloader.increment_downloads()
3852 try:
3853 # Process file information
3854 self._downloader.process_info({
3855 'id': file_id.decode('utf-8'),
3856 'url': file_url.decode('utf-8'),
3857 'uploader': uploader.decode('utf-8'),
3858 'upload_date': u'NA',
3859 'title': json_data['name'],
3860 'stitle': _simplify_title(json_data['name']),
3861 'ext': file_url.split('.')[-1].decode('utf-8'),
3862 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3863 'thumbnail': json_data['thumbnail_url'],
3864 'description': json_data['description'],
3865 'player_url': player_url.decode('utf-8'),
3866 })
3867 except UnavailableVideoError, err:
3868 self._downloader.trouble(u'ERROR: unable to download file')
3869
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Matches the site root, a course page (CoursePage.php?course=...) or a
	# single video page (VideoPage.php?course=...&video=...).
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report webpage download."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on URL shape: a single video, one course page, or the
		site root.  The playlist branches recurse via self.extract() for
		every referenced page."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Per-video metadata is published as an XML file next to the videos.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			# Extension taken from the last dot of the file URL.
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title; fall back to the simplified id if missing.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Every linked VideoPage becomes a 'reference' entry, extracted
			# recursively below.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Recurse into every course page linked from the home page.
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3988
3989 class MTVIE(InfoExtractor):
3990 """Information extractor for MTV.com"""
3991
3992 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3993 IE_NAME = u'mtv'
3994
3995 def report_webpage(self, video_id):
3996 """Report information extraction."""
3997 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3998
3999 def report_extraction(self, video_id):
4000 """Report information extraction."""
4001 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
4002
4003 def _real_extract(self, url):
4004 mobj = re.match(self._VALID_URL, url)
4005 if mobj is None:
4006 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4007 return
4008 if not mobj.group('proto'):
4009 url = 'http://' + url
4010 video_id = mobj.group('videoid')
4011 self.report_webpage(video_id)
4012
4013 request = urllib2.Request(url)
4014 try:
4015 webpage = urllib2.urlopen(request).read()
4016 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4017 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4018 return
4019
4020 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4021 if mobj is None:
4022 self._downloader.trouble(u'ERROR: unable to extract song name')
4023 return
4024 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4025 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4026 if mobj is None:
4027 self._downloader.trouble(u'ERROR: unable to extract performer')
4028 return
4029 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4030 video_title = performer + ' - ' + song_name
4031
4032 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4033 if mobj is None:
4034 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4035 return
4036 mtvn_uri = mobj.group(1)
4037
4038 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4039 if mobj is None:
4040 self._downloader.trouble(u'ERROR: unable to extract content id')
4041 return
4042 content_id = mobj.group(1)
4043
4044 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4045 self.report_extraction(video_id)
4046 request = urllib2.Request(videogen_url)
4047 try:
4048 metadataXml = urllib2.urlopen(request).read()
4049 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4050 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4051 return
4052
4053 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4054 renditions = mdoc.findall('.//rendition')
4055
4056 # For now, always pick the highest quality.
4057 rendition = renditions[-1]
4058
4059 try:
4060 _,_,ext = rendition.attrib['type'].partition('/')
4061 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4062 video_url = rendition.find('./src').text
4063 except KeyError:
4064 self._downloader.trouble('Invalid rendition field.')
4065 return
4066
4067 self._downloader.increment_downloads()
4068 info = {
4069 'id': video_id,
4070 'url': video_url,
4071 'uploader': performer,
4072 'title': video_title,
4073 'stitle': _simplify_title(video_title),
4074 'ext': ext,
4075 'format': format,
4076 }
4077
4078 try:
4079 self._downloader.process_info(info)
4080 except UnavailableVideoError, err:
4081 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4082
4083
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered on a downloader via its add_post_processor()
	method. After each successful download, the downloader walks its chain
	of PostProcessors, feeding the first one the download's info dictionary
	and each subsequent one the value returned by its predecessor.

	A return value of None stops the chain; otherwise the returned
	dictionary is handed to the next PostProcessor.

	Like InfoExtractor objects, PostProcessors register mutually with
	their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Attach the downloader this PP belongs to."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this post-processing step.

		The "information" dictionary has the same shape as the ones built
		by InfoExtractors, plus an extra "filepath" key naming the file
		that was downloaded.

		Returning None halts the postprocessing chain; returning a
		(possibly modified) info dictionary passes it along to the next
		PostProcessor. Implementations may also raise PostProcessingError,
		which the calling downloader handles.
		"""
		# Base class behavior: hand the info dict through untouched.
		return information
4129
class AudioConversionError(Exception):
	"""Raised when ffmpeg/ffprobe fails during audio extraction.

	Derives from Exception rather than BaseException so that generic
	`except Exception` handlers catch it; existing handlers that catch
	AudioConversionError directly are unaffected. The base initializer is
	called so str(err) carries the message.
	"""

	def __init__(self, message):
		Exception.__init__(self, message)
		self.message = message
4133
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video
	into a standalone audio file using ffmpeg, probing codecs with ffprobe."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec  # target codec name, or 'best' to avoid re-encoding
		self._preferredquality = preferredquality  # bitrate handed to ffmpeg's -ab, if set
		self._keepvideo = keepvideo  # when False the source video file is deleted afterwards

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the audio stream in *path* via ffprobe,
		or None if ffprobe cannot be run or no audio stream is found."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		# ffprobe prints codec_name before codec_type within each stream
		# block, so remember the last codec_name and report it once a
		# codec_type=audio line confirms the stream is audio.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Convert *path* to *out_path* with ffmpeg.

		codec: value for -acodec, or None to let ffmpeg pick.
		more_opts: extra command-line options appended before the output.
		Raises AudioConversionError if ffmpeg is missing or exits non-zero.
		"""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			# errno 2 == ENOENT: the ffmpeg binary itself is missing.
			if isinstance(e, OSError) and e.errno == 2:
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			# The last stderr line usually carries ffmpeg's actual error.
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		"""Extract audio from information['filepath'].

		Returns the info dict with 'filepath' pointing at the audio file,
		or None on failure (which stops the postprocessing chain).
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		# First branch: the existing stream already matches the request,
		# so copy it without re-encoding where possible.
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4258
4259
4260 def updateSelf(downloader, filename):
4261 ''' Update the program file with the latest version from the repository '''
4262 # Note: downloader only used for options
4263 if not os.access(filename, os.W_OK):
4264 sys.exit('ERROR: no write permissions on %s' % filename)
4265
4266 downloader.to_screen(u'Updating to latest version...')
4267
4268 try:
4269 try:
4270 urlh = urllib.urlopen(UPDATE_URL)
4271 newcontent = urlh.read()
4272
4273 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4274 if vmatch is not None and vmatch.group(1) == __version__:
4275 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4276 return
4277 finally:
4278 urlh.close()
4279 except (IOError, OSError), err:
4280 sys.exit('ERROR: unable to download latest version')
4281
4282 try:
4283 outf = open(filename, 'wb')
4284 try:
4285 outf.write(newcontent)
4286 finally:
4287 outf.close()
4288 except (IOError, OSError), err:
4289 sys.exit('ERROR: unable to overwrite current version')
4290
4291 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4292
4293 def parseOpts():
4294 def _readOptions(filename_bytes):
4295 try:
4296 optionf = open(filename_bytes)
4297 except IOError:
4298 return [] # silently skip if file is not present
4299 try:
4300 res = []
4301 for l in optionf:
4302 res += shlex.split(l, comments=True)
4303 finally:
4304 optionf.close()
4305 return res
4306
4307 def _format_option_string(option):
4308 ''' ('-o', '--option') -> -o, --format METAVAR'''
4309
4310 opts = []
4311
4312 if option._short_opts: opts.append(option._short_opts[0])
4313 if option._long_opts: opts.append(option._long_opts[0])
4314 if len(opts) > 1: opts.insert(1, ', ')
4315
4316 if option.takes_value(): opts.append(' %s' % option.metavar)
4317
4318 return "".join(opts)
4319
4320 def _find_term_columns():
4321 columns = os.environ.get('COLUMNS', None)
4322 if columns:
4323 return int(columns)
4324
4325 try:
4326 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4327 out,err = sp.communicate()
4328 return int(out.split()[1])
4329 except:
4330 pass
4331 return None
4332
4333 max_width = 80
4334 max_help_position = 80
4335
4336 # No need to wrap help messages if we're on a wide console
4337 columns = _find_term_columns()
4338 if columns: max_width = columns
4339
4340 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4341 fmt.format_option_strings = _format_option_string
4342
4343 kw = {
4344 'version' : __version__,
4345 'formatter' : fmt,
4346 'usage' : '%prog [options] url [url...]',
4347 'conflict_handler' : 'resolve',
4348 }
4349
4350 parser = optparse.OptionParser(**kw)
4351
4352 # option groups
4353 general = optparse.OptionGroup(parser, 'General Options')
4354 selection = optparse.OptionGroup(parser, 'Video Selection')
4355 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4356 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4357 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4358 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4359 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4360
4361 general.add_option('-h', '--help',
4362 action='help', help='print this help text and exit')
4363 general.add_option('-v', '--version',
4364 action='version', help='print program version and exit')
4365 general.add_option('-U', '--update',
4366 action='store_true', dest='update_self', help='update this program to latest version')
4367 general.add_option('-i', '--ignore-errors',
4368 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4369 general.add_option('-r', '--rate-limit',
4370 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4371 general.add_option('-R', '--retries',
4372 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4373 general.add_option('--dump-user-agent',
4374 action='store_true', dest='dump_user_agent',
4375 help='display the current browser identification', default=False)
4376 general.add_option('--list-extractors',
4377 action='store_true', dest='list_extractors',
4378 help='List all supported extractors and the URLs they would handle', default=False)
4379
4380 selection.add_option('--playlist-start',
4381 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4382 selection.add_option('--playlist-end',
4383 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4384 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4385 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4386 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4387
4388 authentication.add_option('-u', '--username',
4389 dest='username', metavar='USERNAME', help='account username')
4390 authentication.add_option('-p', '--password',
4391 dest='password', metavar='PASSWORD', help='account password')
4392 authentication.add_option('-n', '--netrc',
4393 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4394
4395
4396 video_format.add_option('-f', '--format',
4397 action='store', dest='format', metavar='FORMAT', help='video format code')
4398 video_format.add_option('--all-formats',
4399 action='store_const', dest='format', help='download all available video formats', const='all')
4400 video_format.add_option('--prefer-free-formats',
4401 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4402 video_format.add_option('--max-quality',
4403 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4404 video_format.add_option('-F', '--list-formats',
4405 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4406 video_format.add_option('--write-srt',
4407 action='store_true', dest='writesubtitles',
4408 help='write video closed captions to a .srt file (currently youtube only)', default=False)
4409 video_format.add_option('--srt-lang',
4410 action='store', dest='subtitleslang', metavar='LANG',
4411 help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4412
4413
4414 verbosity.add_option('-q', '--quiet',
4415 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4416 verbosity.add_option('-s', '--simulate',
4417 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4418 verbosity.add_option('--skip-download',
4419 action='store_true', dest='skip_download', help='do not download the video', default=False)
4420 verbosity.add_option('-g', '--get-url',
4421 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4422 verbosity.add_option('-e', '--get-title',
4423 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4424 verbosity.add_option('--get-thumbnail',
4425 action='store_true', dest='getthumbnail',
4426 help='simulate, quiet but print thumbnail URL', default=False)
4427 verbosity.add_option('--get-description',
4428 action='store_true', dest='getdescription',
4429 help='simulate, quiet but print video description', default=False)
4430 verbosity.add_option('--get-filename',
4431 action='store_true', dest='getfilename',
4432 help='simulate, quiet but print output filename', default=False)
4433 verbosity.add_option('--get-format',
4434 action='store_true', dest='getformat',
4435 help='simulate, quiet but print output format', default=False)
4436 verbosity.add_option('--no-progress',
4437 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4438 verbosity.add_option('--console-title',
4439 action='store_true', dest='consoletitle',
4440 help='display progress in console titlebar', default=False)
4441 verbosity.add_option('-v', '--verbose',
4442 action='store_true', dest='verbose', help='print various debugging information', default=False)
4443
4444
4445 filesystem.add_option('-t', '--title',
4446 action='store_true', dest='usetitle', help='use title in file name', default=False)
4447 filesystem.add_option('-l', '--literal',
4448 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4449 filesystem.add_option('-A', '--auto-number',
4450 action='store_true', dest='autonumber',
4451 help='number downloaded files starting from 00000', default=False)
4452 filesystem.add_option('-o', '--output',
4453 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4454 filesystem.add_option('-a', '--batch-file',
4455 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4456 filesystem.add_option('-w', '--no-overwrites',
4457 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4458 filesystem.add_option('-c', '--continue',
4459 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4460 filesystem.add_option('--no-continue',
4461 action='store_false', dest='continue_dl',
4462 help='do not resume partially downloaded files (restart from beginning)')
4463 filesystem.add_option('--cookies',
4464 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4465 filesystem.add_option('--no-part',
4466 action='store_true', dest='nopart', help='do not use .part files', default=False)
4467 filesystem.add_option('--no-mtime',
4468 action='store_false', dest='updatetime',
4469 help='do not use the Last-modified header to set the file modification time', default=True)
4470 filesystem.add_option('--write-description',
4471 action='store_true', dest='writedescription',
4472 help='write video description to a .description file', default=False)
4473 filesystem.add_option('--write-info-json',
4474 action='store_true', dest='writeinfojson',
4475 help='write video metadata to a .info.json file', default=False)
4476
4477
4478 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4479 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4480 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4481 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4482 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4483 help='ffmpeg audio bitrate specification, 128k by default')
4484 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4485 help='keeps the video file on disk after the post-processing; the video is erased by default')
4486
4487
4488 parser.add_option_group(general)
4489 parser.add_option_group(selection)
4490 parser.add_option_group(filesystem)
4491 parser.add_option_group(verbosity)
4492 parser.add_option_group(video_format)
4493 parser.add_option_group(authentication)
4494 parser.add_option_group(postproc)
4495
4496 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4497 if xdg_config_home:
4498 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4499 else:
4500 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4501 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4502 opts, args = parser.parse_args(argv)
4503
4504 return parser, opts, args
4505
def gen_extractors():
	"""Build and return the ordered list of information-extractor instances.

	Order is significant: a URL is handled by the first extractor whose
	suitable() check matches it, so the catch-all GenericIE must come last.
	"""
	# Shared instances: the playlist/user/search extractors delegate to
	# the same YoutubeIE object, and likewise for Google and Yahoo.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()

	extractors = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),
	]
	# The generic fallback goes last so that every specific extractor wins.
	extractors.append(GenericIE())
	return extractors
4542
def _real_main():
	"""Command-line entry point: parse and validate options, set up the
	global urllib2 opener, build a FileDownloader with all extractors and
	post-processors attached, and download every requested URL.

	Terminates the process via sys.exit() on every path; never returns
	normally. Exceptions it does not catch (e.g. DownloadError) are
	translated into exit codes by main().
	"""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		# No --cookies given: use an in-memory jar that is never persisted.
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only pre-load the jar when the file already exists and is
			# readable; a missing file is fine (it is created on save below).
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				# '-' means: read the URL list from standard input.
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Drop blank lines and comment lines starting with '#', '/' or ';'.
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	# Batch-file URLs come first, then positional command-line arguments.
	all_urls = batchurls + args
	all_urls = map(lambda url: url.strip(), all_urls)

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	# Installed globally so every urllib2 request in the program shares
	# the same cookies and proxy configuration.
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	if opts.verbose:
		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	if opts.list_extractors:
		# For each extractor, print its name followed by the given URLs it
		# would claim; each URL is consumed by the first matching extractor.
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u' ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# Username given without a password: prompt for it interactively.
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		# parse_bytes understands suffixed values like '50k' or '44.6m';
		# it returns None on a malformed specification.
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			# long() (Python 2) also accepts very large retry counts.
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		# -1 is the sentinel for "until the end of the playlist".
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		# Any --get-* option implies quiet mode so that only the requested
		# field is written to stdout.
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		# Simulation and the --get-* options all suppress the actual download.
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# Output template: an explicit -o wins; otherwise pick a default
		# based on the format/title/literal/autonumber flags, falling back
		# to plain '%(id)s.%(ext)s'. The or-chain returns the first truthy
		# template whose conditions all hold.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# When the video itself goes to stdout ('-o -'), status messages
		# must go to stderr instead.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'writesubtitles': opts.writesubtitles,
		'subtitleslang': opts.subtitleslang,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		# NOTE(review): presumably updateSelf rewrites sys.argv[0] in place
		# and may terminate the process itself — confirm against its body.
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			# --update with no URLs is a valid invocation: update and exit.
			sys.exit()

	try:
		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		# --max-downloads aborts the run via this exception; exit code 101
		# distinguishes it from ordinary failures.
		fd.to_screen(u'--max-download limit reached, aborting.')
		retcode = 101

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4717
def main():
	"""Outermost entry point: run _real_main() and map the well-known
	failure modes onto process exit codes / messages."""
	try:
		_real_main()
	except KeyboardInterrupt:
		# Ctrl-C: report the interruption on stderr and exit non-zero.
		sys.exit(u'\nERROR: Interrupted by user')
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except DownloadError:
		# The downloader already printed its own diagnostics.
		sys.exit(1)
4727
# Run the command-line interface only when executed as a script,
# not when this file is imported as a module.
if __name__ == '__main__':
	main()
4730
4731 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: