youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
# People credited as authors of this script.
__author__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        'Kevin Ngo',
        'Ori Avtalion',
        )

__license__ = 'Public Domain'
# Version string of this release (looks like a YYYY.MM.DD date).
__version__ = '2011.11.22'

# URL of the youtube-dl script on the master branch.
# NOTE(review): presumably used by a self-update feature defined elsewhere in
# the file -- confirm against the code that reads it.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
  23
  24 import cookielib
  25 import datetime
  26 import gzip
  27 import htmlentitydefs
  28 import HTMLParser
  29 import httplib
  30 import locale
  31 import math
  32 import netrc
  33 import os
  34 import os.path
  35 import re
  36 import socket
  37 import string
  38 import subprocess
  39 import sys
  40 import time
  41 import urllib
  42 import urllib2
  43 import warnings
  44 import zlib
  45
  46 if os.name == 'nt':
  47         import ctypes
  48
  49 try:
  50         import email.utils
  51 except ImportError: # Python 2.4
  52         import email.Utils
  53 try:
  54         import cStringIO as StringIO
  55 except ImportError:
  56         import StringIO
  57
  58 # parse_qs was moved from the cgi module to the urlparse module recently.
  59 try:
  60         from urlparse import parse_qs
  61 except ImportError:
  62         from cgi import parse_qs
  63
  64 try:
  65         import lxml.etree
  66 except ImportError:
  67         pass # Handled below
  68
  69 try:
  70         import xml.etree.ElementTree
  71 except ImportError: # Python<2.5: Not officially supported, but let it slip
  72         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
  73
# Default HTTP request headers. YoutubeDLHandler.http_request() sets every one
# of these on each outgoing request, replacing any value already present.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
  81
# Use the standard json module when it exists; otherwise define a minimal
# stand-in class named json that only provides loads().
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                @staticmethod
                def loads(s):
                        """Parse the UTF-8 encoded JSON document s and return the value.

                        Objects become dicts, arrays become lists, strings become
                        unicode strings, numbers become int or float, and
                        true/false/null become True/False/None.

                        Raises ValueError on malformed input or on trailing data.
                        """
                        s = s.decode('UTF-8')
                        # Every parse* helper below takes the index i at which its
                        # value starts and returns a tuple (index after the value, value).
                        def raiseError(msg, i):
                                # Report the position together with the unparsed remainder
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past whitespace; with expectMore, reaching the
                                # end of the input is an error.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # re.sub() callback: group(1) is the text after the backslash
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        # Single \uXXXX escape
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        # Two consecutive \uXXXX escapes (a surrogate pair per
                                        # the pattern in parseString): combine into one code point
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                # i points at the opening quote
                                i += 1
                                e = i
                                # Find the closing quote: a '"' preceded by an even number
                                # of backslashes (an odd count means the quote is escaped)
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                # Surrogate pair, single \uXXXX, any other escaped character,
                                # or a backslash at the very end
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                # i points at the opening '{'
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                # One "key": value pair per iteration
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                # i points at the opening '['
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                # One element per iteration
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # The literals true, false and null
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fraction or an exponent makes it a float, otherwise an int
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Parser selection by the first character of a value; anything
                        # not listed here is parsed as a number (see parse below)
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                # Parse one value of any type, skipping whitespace around it
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        # Exactly one top-level value is allowed
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
 194
 195 def preferredencoding():
 196         """Get preferred encoding.
 197
 198         Returns the best encoding scheme for the system, based on
 199         locale.getpreferredencoding() and some further tweaks.
 200         """
 201         def yield_preferredencoding():
 202                 try:
 203                         pref = locale.getpreferredencoding()
 204                         u'TEST'.encode(pref)
 205                 except:
 206                         pref = 'UTF-8'
 207                 while True:
 208                         yield pref
 209         return yield_preferredencoding().next()
 210
 211
 212 def htmlentity_transform(matchobj):
 213         """Transforms an HTML entity to a Unicode character.
 214
 215         This function receives a match object and is intended to be used with
 216         the re.sub() function.
 217         """
 218         entity = matchobj.group(1)
 219
 220         # Known non-numeric HTML entity
 221         if entity in htmlentitydefs.name2codepoint:
 222                 return unichr(htmlentitydefs.name2codepoint[entity])
 223
 224         # Unicode character
 225         mobj = re.match(ur'(?u)#(x?\d+)', entity)
 226         if mobj is not None:
 227                 numstr = mobj.group(1)
 228                 if numstr.startswith(u'x'):
 229                         base = 16
 230                         numstr = u'0%s' % numstr
 231                 else:
 232                         base = 10
 233                 return unichr(long(numstr, base))
 234
 235         # Unknown entity in name, return its literal representation
 236         return (u'&%s;' % entity)
 237
 238
 239 def sanitize_title(utitle):
 240         """Sanitizes a video title so it could be used as part of a filename."""
 241         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 242         return utitle.replace(unicode(os.sep), u'%')
 243
 244
 245 def sanitize_open(filename, open_mode):
 246         """Try to open the given filename, and slightly tweak it if this fails.
 247
 248         Attempts to open the given filename. If this fails, it tries to change
 249         the filename slightly, step by step, until it's either able to open it
 250         or it fails and raises a final exception, like the standard open()
 251         function.
 252
 253         It returns the tuple (stream, definitive_file_name).
 254         """
 255         try:
 256                 if filename == u'-':
 257                         if sys.platform == 'win32':
 258                                 import msvcrt
 259                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 260                         return (sys.stdout, filename)
 261                 stream = open(filename, open_mode)
 262                 return (stream, filename)
 263         except (IOError, OSError), err:
 264                 # In case of error, try to remove win32 forbidden chars
 265                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 266
 267                 # An exception here should be caught in the caller
 268                 stream = open(filename, open_mode)
 269                 return (stream, filename)
 270
 271
 272 def timeconvert(timestr):
 273         """Convert RFC 2822 defined time string into system timestamp"""
 274         timestamp = None
 275         timetuple = email.utils.parsedate_tz(timestr)
 276         if timetuple is not None:
 277                 timestamp = email.utils.mktime_tz(timetuple)
 278         return timestamp
 279
 280 def _simplify_title(title):
 281         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
 282         return expr.sub(u'_', title).strip(u'_')
 283
 284 class DownloadError(Exception):
 285         """Download Error exception.
 286
 287         This exception may be thrown by FileDownloader objects if they are not
 288         configured to continue on errors. They will contain the appropriate
 289         error message.
 290         """
 291         pass
 292
 293
 294 class SameFileError(Exception):
 295         """Same File exception.
 296
 297         This exception will be thrown by FileDownloader objects if they detect
 298         multiple files would have to be downloaded to the same file on disk.
 299         """
 300         pass
 301
 302
 303 class PostProcessingError(Exception):
 304         """Post Processing exception.
 305
 306         This exception may be raised by PostProcessor's .run() method to
 307         indicate an error in the postprocessing task.
 308         """
 309         pass
 310
 311
 312 class UnavailableVideoError(Exception):
 313         """Unavailable Format exception.
 314
 315         This exception will be thrown when a video is requested
 316         in a format that is not available for that video.
 317         """
 318         pass
 319
 320
 321 class ContentTooShortError(Exception):
 322         """Content Too Short exception.
 323
 324         This exception may be raised by FileDownloader objects when a file they
 325         download is too small for what the server announced first, indicating
 326         the connection was probably interrupted.
 327         """
 328         # Both in bytes
 329         downloaded = None
 330         expected = None
 331
 332         def __init__(self, downloaded, expected):
 333                 self.downloaded = downloaded
 334                 self.expected = expected
 335
 336
 337 class YoutubeDLHandler(urllib2.HTTPHandler):
 338         """Handler for HTTP requests and responses.
 339
 340         This class, when installed with an OpenerDirector, automatically adds
 341         the standard headers to every HTTP request and handles gzipped and
 342         deflated responses from web servers. If compression is to be avoided in
 343         a particular request, the original request in the program code only has
 344         to include the HTTP header "Youtubedl-No-Compression", which will be
 345         removed before making the real request.
 346
 347         Part of this code was copied from:
 348
 349         http://techknack.net/python-urllib2-handlers/
 350
 351         Andrew Rowls, the author of that code, agreed to release it to the
 352         public domain.
 353         """
 354
 355         @staticmethod
 356         def deflate(data):
 357                 try:
 358                         return zlib.decompress(data, -zlib.MAX_WBITS)
 359                 except zlib.error:
 360                         return zlib.decompress(data)
 361
 362         @staticmethod
 363         def addinfourl_wrapper(stream, headers, url, code):
 364                 if hasattr(urllib2.addinfourl, 'getcode'):
 365                         return urllib2.addinfourl(stream, headers, url, code)
 366                 ret = urllib2.addinfourl(stream, headers, url)
 367                 ret.code = code
 368                 return ret
 369
 370         def http_request(self, req):
 371                 for h in std_headers:
 372                         if h in req.headers:
 373                                 del req.headers[h]
 374                         req.add_header(h, std_headers[h])
 375                 if 'Youtubedl-no-compression' in req.headers:
 376                         if 'Accept-encoding' in req.headers:
 377                                 del req.headers['Accept-encoding']
 378                         del req.headers['Youtubedl-no-compression']
 379                 return req
 380
 381         def http_response(self, req, resp):
 382                 old_resp = resp
 383                 # gzip
 384                 if resp.headers.get('Content-encoding', '') == 'gzip':
 385                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 386                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 387                         resp.msg = old_resp.msg
 388                 # deflate
 389                 if resp.headers.get('Content-encoding', '') == 'deflate':
 390                         gz = StringIO.StringIO(self.deflate(resp.read()))
 391                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 392                         resp.msg = old_resp.msg
 393                 return resp
 394
 395
 396 class FileDownloader(object):
 397         """File Downloader class.
 398
        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. Given a video URL, the downloader doesn't know how to
        extract all the needed information (that is the task of the
        InfoExtractors), so it has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.
 413
 414         File downloaders accept a lot of parameters. In order not to saturate
 415         the object constructor with arguments, it receives a dictionary of
 416         options instead. These options are available through the params
 417         attribute for the InfoExtractors to use. The FileDownloader also
 418         registers itself as the downloader in charge for the InfoExtractors
 419         that are added to it, so this is a "mutual registration".
 420
 421         Available options:
 422
 423         username:         Username for authentication purposes.
 424         password:         Password for authentication purposes.
 425         usenetrc:         Use netrc for authentication instead.
 426         quiet:            Do not print messages to stdout.
 427         forceurl:         Force printing final URL.
 428         forcetitle:       Force printing title.
 429         forcethumbnail:   Force printing thumbnail URL.
 430         forcedescription: Force printing description.
 431         forcefilename:    Force printing final filename.
 432         simulate:         Do not download the video files.
 433         format:           Video format code.
 434         format_limit:     Highest quality format to try.
 435         outtmpl:          Template for output names.
 436         ignoreerrors:     Do not stop on download errors.
 437         ratelimit:        Download speed limit, in bytes/sec.
 438         nooverwrites:     Prevent overwriting files.
 439         retries:          Number of times to retry for HTTP error 5xx
 440         continuedl:       Try to continue downloads if possible.
 441         noprogress:       Do not print the progress bar.
 442         playliststart:    Playlist item to start at.
 443         playlistend:      Playlist item to end at.
 444         matchtitle:       Download only matching titles.
 445         rejecttitle:      Reject downloads for matching titles.
 446         logtostderr:      Log messages to stderr instead of stdout.
 447         consoletitle:     Display progress in console window's titlebar.
 448         nopart:           Do not use temporary .part files.
 449         updatetime:       Use the Last-modified header to set output file timestamps.
 450         writedescription: Write the video description to a .description file
 451         writeinfojson:    Write the video description to a .info.json file
 452         """
 453
 454         params = None
 455         _ies = []
 456         _pps = []
 457         _download_retcode = None
 458         _num_downloads = None
 459         _screen_file = None
 460
 461         def __init__(self, params):
 462                 """Create a FileDownloader object with the given options."""
 463                 self._ies = []
 464                 self._pps = []
 465                 self._download_retcode = 0
 466                 self._num_downloads = 0
 467                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 468                 self.params = params
 469
 470         @staticmethod
 471         def format_bytes(bytes):
 472                 if bytes is None:
 473                         return 'N/A'
 474                 if type(bytes) is str:
 475                         bytes = float(bytes)
 476                 if bytes == 0.0:
 477                         exponent = 0
 478                 else:
 479                         exponent = long(math.log(bytes, 1024.0))
 480                 suffix = 'bkMGTPEZY'[exponent]
 481                 converted = float(bytes) / float(1024 ** exponent)
 482                 return '%.2f%s' % (converted, suffix)
 483
 484         @staticmethod
 485         def calc_percent(byte_counter, data_len):
 486                 if data_len is None:
 487                         return '---.-%'
 488                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 489
 490         @staticmethod
 491         def calc_eta(start, now, total, current):
 492                 if total is None:
 493                         return '--:--'
 494                 dif = now - start
 495                 if current == 0 or dif < 0.001: # One millisecond
 496                         return '--:--'
 497                 rate = float(current) / dif
 498                 eta = long((float(total) - float(current)) / rate)
 499                 (eta_mins, eta_secs) = divmod(eta, 60)
 500                 if eta_mins > 99:
 501                         return '--:--'
 502                 return '%02d:%02d' % (eta_mins, eta_secs)
 503
 504         @staticmethod
 505         def calc_speed(start, now, bytes):
 506                 dif = now - start
 507                 if bytes == 0 or dif < 0.001: # One millisecond
 508                         return '%10s' % '---b/s'
 509                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 510
 511         @staticmethod
 512         def best_block_size(elapsed_time, bytes):
 513                 new_min = max(bytes / 2.0, 1.0)
 514                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 515                 if elapsed_time < 0.001:
 516                         return long(new_max)
 517                 rate = bytes / elapsed_time
 518                 if rate > new_max:
 519                         return long(new_max)
 520                 if rate < new_min:
 521                         return long(new_min)
 522                 return long(rate)
 523
 524         @staticmethod
 525         def parse_bytes(bytestr):
 526                 """Parse a string indicating a byte quantity into a long integer."""
 527                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 528                 if matchobj is None:
 529                         return None
 530                 number = float(matchobj.group(1))
 531                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 532                 return long(round(number * multiplier))
 533
 534         def add_info_extractor(self, ie):
 535                 """Add an InfoExtractor object to the end of the list."""
 536                 self._ies.append(ie)
 537                 ie.set_downloader(self)
 538
 539         def add_post_processor(self, pp):
 540                 """Add a PostProcessor object to the end of the chain."""
 541                 self._pps.append(pp)
 542                 pp.set_downloader(self)
 543
 544         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
 545                 """Print message to stdout if not in quiet mode."""
 546                 try:
 547                         if not self.params.get('quiet', False):
 548                                 terminator = [u'\n', u''][skip_eol]
 549                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
 550                         self._screen_file.flush()
 551                 except (UnicodeEncodeError), err:
 552                         if not ignore_encoding_errors:
 553                                 raise
 554
 555         def to_stderr(self, message):
 556                 """Print message to stderr."""
 557                 print >>sys.stderr, message.encode(preferredencoding())
 558
 559         def to_cons_title(self, message):
 560                 """Set console/terminal window title to message."""
 561                 if not self.params.get('consoletitle', False):
 562                         return
 563                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 564                         # c_wchar_p() might not be necessary if `message` is
 565                         # already of type unicode()
 566                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 567                 elif 'TERM' in os.environ:
 568                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 569
 570         def fixed_template(self):
 571                 """Checks if the output template is fixed."""
 572                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 573
 574         def trouble(self, message=None):
 575                 """Determine action to take when a download problem appears.
 576
 577                 Depending on if the downloader has been configured to ignore
 578                 download errors or not, this method may throw an exception or
 579                 not when errors are found, after printing the message.
 580                 """
 581                 if message is not None:
 582                         self.to_stderr(message)
 583                 if not self.params.get('ignoreerrors', False):
 584                         raise DownloadError(message)
 585                 self._download_retcode = 1
 586
 587         def slow_down(self, start_time, byte_counter):
 588                 """Sleep if the download speed is over the rate limit."""
 589                 rate_limit = self.params.get('ratelimit', None)
 590                 if rate_limit is None or byte_counter == 0:
 591                         return
 592                 now = time.time()
 593                 elapsed = now - start_time
 594                 if elapsed <= 0.0:
 595                         return
 596                 speed = float(byte_counter) / elapsed
 597                 if speed > rate_limit:
 598                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 599
 600         def temp_name(self, filename):
 601                 """Returns a temporary filename for the given filename."""
 602                 if self.params.get('nopart', False) or filename == u'-' or \
 603                                 (os.path.exists(filename) and not os.path.isfile(filename)):
 604                         return filename
 605                 return filename + u'.part'
 606
 607         def undo_temp_name(self, filename):
 608                 if filename.endswith(u'.part'):
 609                         return filename[:-len(u'.part')]
 610                 return filename
 611
 612         def try_rename(self, old_filename, new_filename):
 613                 try:
 614                         if old_filename == new_filename:
 615                                 return
 616                         os.rename(old_filename, new_filename)
 617                 except (IOError, OSError), err:
 618                         self.trouble(u'ERROR: unable to rename file')
 619
 620         def try_utime(self, filename, last_modified_hdr):
 621                 """Try to set the last-modified time of the given file."""
 622                 if last_modified_hdr is None:
 623                         return
 624                 if not os.path.isfile(filename):
 625                         return
 626                 timestr = last_modified_hdr
 627                 if timestr is None:
 628                         return
 629                 filetime = timeconvert(timestr)
 630                 if filetime is None:
 631                         return filetime
 632                 try:
 633                         os.utime(filename, (time.time(), filetime))
 634                 except:
 635                         pass
 636                 return filetime
 637
 638         def report_writedescription(self, descfn):
 639                 """ Report that the description file is being written """
 640                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
 641
 642         def report_writeinfojson(self, infofn):
 643                 """ Report that the metadata file has been written """
 644                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
 645
 646         def report_destination(self, filename):
 647                 """Report destination filename."""
 648                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 649
 650         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 651                 """Report download progress."""
 652                 if self.params.get('noprogress', False):
 653                         return
 654                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 655                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 656                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 657                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 658
 659         def report_resuming_byte(self, resume_len):
 660                 """Report attempt to resume at given byte."""
 661                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 662
 663         def report_retry(self, count, retries):
 664                 """Report retry in case of HTTP error 5xx"""
 665                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 666
 667         def report_file_already_downloaded(self, file_name):
 668                 """Report file has already been fully downloaded."""
 669                 try:
 670                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 671                 except (UnicodeEncodeError), err:
 672                         self.to_screen(u'[download] The file has already been downloaded')
 673
 674         def report_unable_to_resume(self):
 675                 """Report it was impossible to resume download."""
 676                 self.to_screen(u'[download] Unable to resume')
 677
 678         def report_finish(self):
 679                 """Report download finished."""
 680                 if self.params.get('noprogress', False):
 681                         self.to_screen(u'[download] Download completed')
 682                 else:
 683                         self.to_screen(u'')
 684
 685         def increment_downloads(self):
 686                 """Increment the ordinal that assigns a number to each file."""
 687                 self._num_downloads += 1
 688
 689         def prepare_filename(self, info_dict):
 690                 """Generate the output filename."""
 691                 try:
 692                         template_dict = dict(info_dict)
 693                         template_dict['epoch'] = unicode(long(time.time()))
 694                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 695                         filename = self.params['outtmpl'] % template_dict
 696                         return filename
 697                 except (ValueError, KeyError), err:
 698                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 699                         return None
 700
 701         def process_info(self, info_dict):
 702                 """Process a single dictionary returned by an InfoExtractor."""
 703                 filename = self.prepare_filename(info_dict)
 704
 705                 # Forced printings
 706                 if self.params.get('forcetitle', False):
 707                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 708                 if self.params.get('forceurl', False):
 709                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 710                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 711                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 712                 if self.params.get('forcedescription', False) and 'description' in info_dict:
 713                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 714                 if self.params.get('forcefilename', False) and filename is not None:
 715                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
 716                 if self.params.get('forceformat', False):
 717                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
 718
 719                 # Do nothing else if in simulate mode
 720                 if self.params.get('simulate', False):
 721                         return
 722
 723                 if filename is None:
 724                         return
 725
 726                 matchtitle=self.params.get('matchtitle',False)
 727                 rejecttitle=self.params.get('rejecttitle',False)
 728                 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 729                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
 730                         self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
 731                         return
 732                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
 733                         self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
 734                         return
 735
 736                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 737                         self.to_stderr(u'WARNING: file exists and will be skipped')
 738                         return
 739
 740                 try:
 741                         dn = os.path.dirname(filename)
 742                         if dn != '' and not os.path.exists(dn):
 743                                 os.makedirs(dn)
 744                 except (OSError, IOError), err:
 745                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
 746                         return
 747
 748                 if self.params.get('writedescription', False):
 749                         try:
 750                                 descfn = filename + '.description'
 751                                 self.report_writedescription(descfn)
 752                                 descfile = open(descfn, 'wb')
 753                                 try:
 754                                         descfile.write(info_dict['description'].encode('utf-8'))
 755                                 finally:
 756                                         descfile.close()
 757                         except (OSError, IOError):
 758                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
 759                                 return
 760
 761                 if self.params.get('writeinfojson', False):
 762                         infofn = filename + '.info.json'
 763                         self.report_writeinfojson(infofn)
 764                         try:
 765                                 json.dump
 766                         except (NameError,AttributeError):
 767                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
 768                                 return
 769                         try:
 770                                 infof = open(infofn, 'wb')
 771                                 try:
 772                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
 773                                         json.dump(json_info_dict, infof)
 774                                 finally:
 775                                         infof.close()
 776                         except (OSError, IOError):
 777                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
 778                                 return
 779
 780                 if not self.params.get('skip_download', False):
 781                         try:
 782                                 success = self._do_download(filename, info_dict)
 783                         except (OSError, IOError), err:
 784                                 raise UnavailableVideoError
 785                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 786                                 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 787                                 return
 788                         except (ContentTooShortError, ), err:
 789                                 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 790                                 return
 791
 792                         if success:
 793                                 try:
 794                                         self.post_process(filename, info_dict)
 795                                 except (PostProcessingError), err:
 796                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
 797                                         return
 798
 799         def download(self, url_list):
 800                 """Download a given list of URLs."""
 801                 if len(url_list) > 1 and self.fixed_template():
 802                         raise SameFileError(self.params['outtmpl'])
 803
 804                 for url in url_list:
 805                         suitable_found = False
 806                         for ie in self._ies:
 807                                 # Go to next InfoExtractor if not suitable
 808                                 if not ie.suitable(url):
 809                                         continue
 810
 811                                 # Suitable InfoExtractor found
 812                                 suitable_found = True
 813
 814                                 # Extract information from URL and process it
 815                                 ie.extract(url)
 816
 817                                 # Suitable InfoExtractor had been found; go to next URL
 818                                 break
 819
 820                         if not suitable_found:
 821                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 822
 823                 return self._download_retcode
 824
 825         def post_process(self, filename, ie_info):
 826                 """Run the postprocessing chain on the given file."""
 827                 info = dict(ie_info)
 828                 info['filepath'] = filename
 829                 for pp in self._pps:
 830                         info = pp.run(info)
 831                         if info is None:
 832                                 break
 833
 834         def _download_with_rtmpdump(self, filename, url, player_url):
 835                 self.report_destination(filename)
 836                 tmpfilename = self.temp_name(filename)
 837
 838                 # Check for rtmpdump first
 839                 try:
 840                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 841                 except (OSError, IOError):
 842                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 843                         return False
 844
 845                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 846                 # the connection was interrumpted and resuming appears to be
 847                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 848                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 849                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 850                 while retval == 2 or retval == 1:
 851                         prevsize = os.path.getsize(tmpfilename)
 852                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 853                         time.sleep(5.0) # This seems to be needed
 854                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 855                         cursize = os.path.getsize(tmpfilename)
 856                         if prevsize == cursize and retval == 1:
 857                                 break
 858                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
 859                         if prevsize == cursize and retval == 2 and cursize > 1024:
 860                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
 861                                 retval = 0
 862                                 break
 863                 if retval == 0:
 864                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
 865                         self.try_rename(tmpfilename, filename)
 866                         return True
 867                 else:
 868                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 869                         return False
 870
 871         def _do_download(self, filename, info_dict):
 872                 url = info_dict['url']
 873                 player_url = info_dict.get('player_url', None)
 874
 875                 # Check file already present
 876                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
 877                         self.report_file_already_downloaded(filename)
 878                         return True
 879
 880                 # Attempt to download using rtmpdump
 881                 if url.startswith('rtmp'):
 882                         return self._download_with_rtmpdump(filename, url, player_url)
 883
 884                 tmpfilename = self.temp_name(filename)
 885                 stream = None
 886
 887                 # Do not include the Accept-Encoding header
 888                 headers = {'Youtubedl-no-compression': 'True'}
 889                 basic_request = urllib2.Request(url, None, headers)
 890                 request = urllib2.Request(url, None, headers)
 891
 892                 # Establish possible resume length
 893                 if os.path.isfile(tmpfilename):
 894                         resume_len = os.path.getsize(tmpfilename)
 895                 else:
 896                         resume_len = 0
 897
 898                 open_mode = 'wb'
 899                 if resume_len != 0:
 900                         if self.params.get('continuedl', False):
 901                                 self.report_resuming_byte(resume_len)
 902                                 request.add_header('Range','bytes=%d-' % resume_len)
 903                                 open_mode = 'ab'
 904                         else:
 905                                 resume_len = 0
 906
 907                 count = 0
 908                 retries = self.params.get('retries', 0)
 909                 while count <= retries:
 910                         # Establish connection
 911                         try:
 912                                 if count == 0 and 'urlhandle' in info_dict:
 913                                         data = info_dict['urlhandle']
 914                                 data = urllib2.urlopen(request)
 915                                 break
 916                         except (urllib2.HTTPError, ), err:
 917                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
 918                                         # Unexpected HTTP error
 919                                         raise
 920                                 elif err.code == 416:
 921                                         # Unable to resume (requested range not satisfiable)
 922                                         try:
 923                                                 # Open the connection again without the range header
 924                                                 data = urllib2.urlopen(basic_request)
 925                                                 content_length = data.info()['Content-Length']
 926                                         except (urllib2.HTTPError, ), err:
 927                                                 if err.code < 500 or err.code >= 600:
 928                                                         raise
 929                                         else:
 930                                                 # Examine the reported length
 931                                                 if (content_length is not None and
 932                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
 933                                                         # The file had already been fully downloaded.
 934                                                         # Explanation to the above condition: in issue #175 it was revealed that
 935                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
 936                                                         # changing the file size slightly and causing problems for some users. So
 937                                                         # I decided to implement a suggested change and consider the file
 938                                                         # completely downloaded if the file size differs less than 100 bytes from
 939                                                         # the one in the hard drive.
 940                                                         self.report_file_already_downloaded(filename)
 941                                                         self.try_rename(tmpfilename, filename)
 942                                                         return True
 943                                                 else:
 944                                                         # The length does not match, we start the download over
 945                                                         self.report_unable_to_resume()
 946                                                         open_mode = 'wb'
 947                                                         break
 948                         # Retry
 949                         count += 1
 950                         if count <= retries:
 951                                 self.report_retry(count, retries)
 952
 953                 if count > retries:
 954                         self.trouble(u'ERROR: giving up after %s retries' % retries)
 955                         return False
 956
 957                 data_len = data.info().get('Content-length', None)
 958                 if data_len is not None:
 959                         data_len = long(data_len) + resume_len
 960                 data_len_str = self.format_bytes(data_len)
 961                 byte_counter = 0 + resume_len
 962                 block_size = 1024
 963                 start = time.time()
 964                 while True:
 965                         # Download and write
 966                         before = time.time()
 967                         data_block = data.read(block_size)
 968                         after = time.time()
 969                         if len(data_block) == 0:
 970                                 break
 971                         byte_counter += len(data_block)
 972
 973                         # Open file just in time
 974                         if stream is None:
 975                                 try:
 976                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
 977                                         assert stream is not None
 978                                         filename = self.undo_temp_name(tmpfilename)
 979                                         self.report_destination(filename)
 980                                 except (OSError, IOError), err:
 981                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
 982                                         return False
 983                         try:
 984                                 stream.write(data_block)
 985                         except (IOError, OSError), err:
 986                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
 987                                 return False
 988                         block_size = self.best_block_size(after - before, len(data_block))
 989
 990                         # Progress message
 991                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
 992                         if data_len is None:
 993                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
 994                         else:
 995                                 percent_str = self.calc_percent(byte_counter, data_len)
 996                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
 997                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 998
 999                         # Apply rate limit
1000                         self.slow_down(start, byte_counter - resume_len)
1001
1002                 if stream is None:
1003                         self.trouble(u'\nERROR: Did not get any data blocks')
1004                         return False
1005                 stream.close()
1006                 self.report_finish()
1007                 if data_len is not None and byte_counter != data_len:
1008                         raise ContentTooShortError(byte_counter, long(data_len))
1009                 self.try_rename(tmpfilename, filename)
1010
1011                 # Update file modification time
1012                 if self.params.get('updatetime', True):
1013                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1014
1015                 return True
1016
1017
class InfoExtractor(object):
        """Base class for information extractors.

        An information extractor takes a URL and extracts information about
        the video (or videos) it refers to: the real video URL, the title and
        simplified title, the author and more. That information is stored in a
        dictionary and passed to the FileDownloader, which processes it and
        possibly downloads the video to the file system, among other possible
        outcomes. Each dictionary must include these fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        The fields below are optional. They mainly exist so youtube-dl can act
        as the backend of a video search function, such as the one in
        youtube2mp3, and are only used when their respective forced printing
        functions are called:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should redefine _real_initialize() and _real_extract() and
        define a _VALID_URL regexp. They should probably also be added to the
        list of extractors.
        """

        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally attaching a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        def suitable(self, url):
                """Return True when url matches this extractor's _VALID_URL."""
                matched = re.match(self._VALID_URL, url)
                return matched is not None

        def initialize(self):
                """Run _real_initialize() the first time this is called (authentication, etc)."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed, then return the result of _real_extract(url)."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Attach the downloader this extractor reports to."""
                self._downloader = downloader

        def _real_initialize(self):
                """Initialization hook; does nothing here. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Extraction hook; does nothing here. Redefine in subclasses."""
                pass
1086
1087
1088 class YoutubeIE(InfoExtractor):
1089         """Information extractor for youtube.com."""
1090
        # Group 2 of this pattern captures the video id (see _real_extract). Group 1,
        # the URL prefix, is optional, so a bare video id also matches.
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
        # Requested by _real_initialize() to set the language (hl=en, gl=US)
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        # The login form is POSTed to this URL by _real_initialize()
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        # The age confirmation form is POSTed to this URL by _real_initialize()
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name looked up in .netrc when the 'usenetrc' parameter is set
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
        # Filename extension per format code; _print_formats() falls back to 'flv'
        # for codes that are not listed here
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '44': 'webm',
                '45': 'webm',
        }
        # Dimensions per format code; _print_formats() falls back to '???' for codes
        # that are not listed here.
        # NOTE(review): the values look like HEIGHTxWIDTH - confirm
        _video_dimensions = {
                '5': '240x400',
                '6': '???',
                '13': '???',
                '17': '144x176',
                '18': '360x640',
                '22': '720x1280',
                '34': '360x640',
                '35': '480x854',
                '37': '1080x1920',
                '38': '3072x4096',
                '43': '360x640',
                '44': '480x854',
                '45': '720x1280',
        }
        # Name of this extractor
        IE_NAME = u'youtube'
1125
1126         def report_lang(self):
1127                 """Report attempt to set language."""
1128                 self._downloader.to_screen(u'[youtube] Setting language')
1129
1130         def report_login(self):
1131                 """Report attempt to log in."""
1132                 self._downloader.to_screen(u'[youtube] Logging in')
1133
1134         def report_age_confirmation(self):
1135                 """Report attempt to confirm age."""
1136                 self._downloader.to_screen(u'[youtube] Confirming age')
1137
1138         def report_video_webpage_download(self, video_id):
1139                 """Report attempt to download video webpage."""
1140                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1141
1142         def report_video_info_webpage_download(self, video_id):
1143                 """Report attempt to download video info webpage."""
1144                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1145
1146         def report_information_extraction(self, video_id):
1147                 """Report attempt to extract video information."""
1148                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1149
1150         def report_unavailable_format(self, video_id, format):
1151                 """Report extracted video URL."""
1152                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1153
1154         def report_rtmp_download(self):
1155                 """Indicate the download will use the RTMP protocol."""
1156                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1157
1158         def _print_formats(self, formats):
1159                 print 'Available formats:'
1160                 for x in formats:
1161                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1162
1163         def _real_initialize(self):
1164                 if self._downloader is None:
1165                         return
1166
1167                 username = None
1168                 password = None
1169                 downloader_params = self._downloader.params
1170
1171                 # Attempt to use provided username and password or .netrc data
1172                 if downloader_params.get('username', None) is not None:
1173                         username = downloader_params['username']
1174                         password = downloader_params['password']
1175                 elif downloader_params.get('usenetrc', False):
1176                         try:
1177                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1178                                 if info is not None:
1179                                         username = info[0]
1180                                         password = info[2]
1181                                 else:
1182                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1183                         except (IOError, netrc.NetrcParseError), err:
1184                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1185                                 return
1186
1187                 # Set language
1188                 request = urllib2.Request(self._LANG_URL)
1189                 try:
1190                         self.report_lang()
1191                         urllib2.urlopen(request).read()
1192                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1193                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1194                         return
1195
1196                 # No authentication to be performed
1197                 if username is None:
1198                         return
1199
1200                 # Log in
1201                 login_form = {
1202                                 'current_form': 'loginForm',
1203                                 'next':         '/',
1204                                 'action_login': 'Log In',
1205                                 'username':     username,
1206                                 'password':     password,
1207                                 }
1208                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1209                 try:
1210                         self.report_login()
1211                         login_results = urllib2.urlopen(request).read()
1212                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1213                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1214                                 return
1215                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1216                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1217                         return
1218
1219                 # Confirm age
1220                 age_form = {
1221                                 'next_url':             '/',
1222                                 'action_confirm':       'Confirm',
1223                                 }
1224                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1225                 try:
1226                         self.report_age_confirmation()
1227                         age_results = urllib2.urlopen(request).read()
1228                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1229                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1230                         return
1231
1232         def _real_extract(self, url):
1233                 # Extract video id from URL
1234                 mobj = re.match(self._VALID_URL, url)
1235                 if mobj is None:
1236                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1237                         return
1238                 video_id = mobj.group(2)
1239
1240                 # Get video webpage
1241                 self.report_video_webpage_download(video_id)
1242                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1243                 try:
1244                         video_webpage = urllib2.urlopen(request).read()
1245                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1246                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1247                         return
1248
1249                 # Attempt to extract SWF player URL
1250                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1251                 if mobj is not None:
1252                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1253                 else:
1254                         player_url = None
1255
1256                 # Get video info
1257                 self.report_video_info_webpage_download(video_id)
1258                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1259                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1260                                         % (video_id, el_type))
1261                         request = urllib2.Request(video_info_url)
1262                         try:
1263                                 video_info_webpage = urllib2.urlopen(request).read()
1264                                 video_info = parse_qs(video_info_webpage)
1265                                 if 'token' in video_info:
1266                                         break
1267                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1268                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1269                                 return
1270                 if 'token' not in video_info:
1271                         if 'reason' in video_info:
1272                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1273                         else:
1274                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1275                         return
1276
1277                 # Start extracting information
1278                 self.report_information_extraction(video_id)
1279
1280                 # uploader
1281                 if 'author' not in video_info:
1282                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1283                         return
1284                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1285
1286                 # title
1287                 if 'title' not in video_info:
1288                         self._downloader.trouble(u'ERROR: unable to extract video title')
1289                         return
1290                 video_title = urllib.unquote_plus(video_info['title'][0])
1291                 video_title = video_title.decode('utf-8')
1292                 video_title = sanitize_title(video_title)
1293
1294                 # simplified title
1295                 simple_title = _simplify_title(video_title)
1296
1297                 # thumbnail image
1298                 if 'thumbnail_url' not in video_info:
1299                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1300                         video_thumbnail = ''
1301                 else:   # don't panic if we can't find it
1302                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1303
1304                 # upload date
1305                 upload_date = u'NA'
1306                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1307                 if mobj is not None:
1308                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1309                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1310                         for expression in format_expressions:
1311                                 try:
1312                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1313                                 except:
1314                                         pass
1315
1316                 # description
1317                 try:
1318                         lxml.etree
1319                 except NameError:
1320                         video_description = u'No description available.'
1321                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1322                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1323                                 if mobj is not None:
1324                                         video_description = mobj.group(1).decode('utf-8')
1325                 else:
1326                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1327                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1328                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1329                         # TODO use another parser
1330
1331                 # token
1332                 video_token = urllib.unquote_plus(video_info['token'][0])
1333
1334                 # Decide which formats to download
1335                 req_format = self._downloader.params.get('format', None)
1336
1337                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1338                         self.report_rtmp_download()
1339                         video_url_list = [(None, video_info['conn'][0])]
1340                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1341                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1342                         url_data = [parse_qs(uds) for uds in url_data_strs]
1343                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1344                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1345
1346                         format_limit = self._downloader.params.get('format_limit', None)
1347                         if format_limit is not None and format_limit in self._available_formats:
1348                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1349                         else:
1350                                 format_list = self._available_formats
1351                         existing_formats = [x for x in format_list if x in url_map]
1352                         if len(existing_formats) == 0:
1353                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1354                                 return
1355                         if self._downloader.params.get('listformats', None):
1356                                 self._print_formats(existing_formats)
1357                                 return
1358                         if req_format is None or req_format == 'best':
1359                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1360                         elif req_format == 'worst':
1361                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1362                         elif req_format in ('-1', 'all'):
1363                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1364                         else:
1365                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1366                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1367                                 req_formats = req_format.split('/')
1368                                 video_url_list = None
1369                                 for rf in req_formats:
1370                                         if rf in url_map:
1371                                                 video_url_list = [(rf, url_map[rf])]
1372                                                 break
1373                                 if video_url_list is None:
1374                                         self._downloader.trouble(u'ERROR: requested format not available')
1375                                         return
1376                 else:
1377                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1378                         return
1379
1380                 for format_param, video_real_url in video_url_list:
1381                         # At this point we have a new video
1382                         self._downloader.increment_downloads()
1383
1384                         # Extension
1385                         video_extension = self._video_extensions.get(format_param, 'flv')
1386
1387                         try:
1388                                 # Process video information
1389                                 self._downloader.process_info({
1390                                         'id':           video_id.decode('utf-8'),
1391                                         'url':          video_real_url.decode('utf-8'),
1392                                         'uploader':     video_uploader.decode('utf-8'),
1393                                         'upload_date':  upload_date,
1394                                         'title':        video_title,
1395                                         'stitle':       simple_title,
1396                                         'ext':          video_extension.decode('utf-8'),
1397                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1398                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1399                                         'description':  video_description,
1400                                         'player_url':   player_url,
1401                                 })
1402                         except UnavailableVideoError, err:
1403                                 self._downloader.trouble(u'\nERROR: unable to download video')
1404
1405
1406 class MetacafeIE(InfoExtractor):
1407         """Information Extractor for metacafe.com."""
1408
1409         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1410         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1411         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1412         _youtube_ie = None
1413         IE_NAME = u'metacafe'
1414
1415         def __init__(self, youtube_ie, downloader=None):
1416                 InfoExtractor.__init__(self, downloader)
1417                 self._youtube_ie = youtube_ie
1418
1419         def report_disclaimer(self):
1420                 """Report disclaimer retrieval."""
1421                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1422
1423         def report_age_confirmation(self):
1424                 """Report attempt to confirm age."""
1425                 self._downloader.to_screen(u'[metacafe] Confirming age')
1426
1427         def report_download_webpage(self, video_id):
1428                 """Report webpage download."""
1429                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1430
1431         def report_extraction(self, video_id):
1432                 """Report information extraction."""
1433                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1434
1435         def _real_initialize(self):
1436                 # Retrieve disclaimer
1437                 request = urllib2.Request(self._DISCLAIMER)
1438                 try:
1439                         self.report_disclaimer()
1440                         disclaimer = urllib2.urlopen(request).read()
1441                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1442                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1443                         return
1444
1445                 # Confirm age
1446                 disclaimer_form = {
1447                         'filters': '0',
1448                         'submit': "Continue - I'm over 18",
1449                         }
1450                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1451                 try:
1452                         self.report_age_confirmation()
1453                         disclaimer = urllib2.urlopen(request).read()
1454                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1455                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1456                         return
1457
1458         def _real_extract(self, url):
1459                 # Extract id and simplified title from URL
1460                 mobj = re.match(self._VALID_URL, url)
1461                 if mobj is None:
1462                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1463                         return
1464
1465                 video_id = mobj.group(1)
1466
1467                 # Check if video comes from YouTube
1468                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1469                 if mobj2 is not None:
1470                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1471                         return
1472
1473                 # At this point we have a new video
1474                 self._downloader.increment_downloads()
1475
1476                 simple_title = mobj.group(2).decode('utf-8')
1477
1478                 # Retrieve video webpage to extract further information
1479                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1480                 try:
1481                         self.report_download_webpage(video_id)
1482                         webpage = urllib2.urlopen(request).read()
1483                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1484                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1485                         return
1486
1487                 # Extract URL, uploader and title from webpage
1488                 self.report_extraction(video_id)
1489                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1490                 if mobj is not None:
1491                         mediaURL = urllib.unquote(mobj.group(1))
1492                         video_extension = mediaURL[-3:]
1493
1494                         # Extract gdaKey if available
1495                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1496                         if mobj is None:
1497                                 video_url = mediaURL
1498                         else:
1499                                 gdaKey = mobj.group(1)
1500                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1501                 else:
1502                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1503                         if mobj is None:
1504                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1505                                 return
1506                         vardict = parse_qs(mobj.group(1))
1507                         if 'mediaData' not in vardict:
1508                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1509                                 return
1510                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1511                         if mobj is None:
1512                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1513                                 return
1514                         mediaURL = mobj.group(1).replace('\\/', '/')
1515                         video_extension = mediaURL[-3:]
1516                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1517
1518                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1519                 if mobj is None:
1520                         self._downloader.trouble(u'ERROR: unable to extract title')
1521                         return
1522                 video_title = mobj.group(1).decode('utf-8')
1523                 video_title = sanitize_title(video_title)
1524
1525                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1526                 if mobj is None:
1527                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1528                         return
1529                 video_uploader = mobj.group(1)
1530
1531                 try:
1532                         # Process video information
1533                         self._downloader.process_info({
1534                                 'id':           video_id.decode('utf-8'),
1535                                 'url':          video_url.decode('utf-8'),
1536                                 'uploader':     video_uploader.decode('utf-8'),
1537                                 'upload_date':  u'NA',
1538                                 'title':        video_title,
1539                                 'stitle':       simple_title,
1540                                 'ext':          video_extension.decode('utf-8'),
1541                                 'format':       u'NA',
1542                                 'player_url':   None,
1543                         })
1544                 except UnavailableVideoError:
1545                         self._downloader.trouble(u'\nERROR: unable to download video')
1546
1547
1548 class DailymotionIE(InfoExtractor):
1549         """Information Extractor for Dailymotion"""
1550
1551         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1552         IE_NAME = u'dailymotion'
1553
1554         def __init__(self, downloader=None):
1555                 InfoExtractor.__init__(self, downloader)
1556
1557         def report_download_webpage(self, video_id):
1558                 """Report webpage download."""
1559                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1560
1561         def report_extraction(self, video_id):
1562                 """Report information extraction."""
1563                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1564
1565         def _real_extract(self, url):
1566                 # Extract id and simplified title from URL
1567                 mobj = re.match(self._VALID_URL, url)
1568                 if mobj is None:
1569                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1570                         return
1571
1572                 # At this point we have a new video
1573                 self._downloader.increment_downloads()
1574                 video_id = mobj.group(1)
1575
1576                 simple_title = mobj.group(2).decode('utf-8')
1577                 video_extension = 'flv'
1578
1579                 # Retrieve video webpage to extract further information
1580                 request = urllib2.Request(url)
1581                 request.add_header('Cookie', 'family_filter=off')
1582                 try:
1583                         self.report_download_webpage(video_id)
1584                         webpage = urllib2.urlopen(request).read()
1585                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1586                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1587                         return
1588
1589                 # Extract URL, uploader and title from webpage
1590                 self.report_extraction(video_id)
1591                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1592                 if mobj is None:
1593                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1594                         return
1595                 sequence = urllib.unquote(mobj.group(1))
1596                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1597                 if mobj is None:
1598                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1599                         return
1600                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1601
1602                 # if needed add http://www.dailymotion.com/ if relative URL
1603
1604                 video_url = mediaURL
1605
1606                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1607                 if mobj is None:
1608                         self._downloader.trouble(u'ERROR: unable to extract title')
1609                         return
1610                 video_title = mobj.group(1).decode('utf-8')
1611                 video_title = sanitize_title(video_title)
1612
1613                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1614                 if mobj is None:
1615                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1616                         return
1617                 video_uploader = mobj.group(1)
1618
1619                 try:
1620                         # Process video information
1621                         self._downloader.process_info({
1622                                 'id':           video_id.decode('utf-8'),
1623                                 'url':          video_url.decode('utf-8'),
1624                                 'uploader':     video_uploader.decode('utf-8'),
1625                                 'upload_date':  u'NA',
1626                                 'title':        video_title,
1627                                 'stitle':       simple_title,
1628                                 'ext':          video_extension.decode('utf-8'),
1629                                 'format':       u'NA',
1630                                 'player_url':   None,
1631                         })
1632                 except UnavailableVideoError:
1633                         self._downloader.trouble(u'\nERROR: unable to download video')
1634
1635
1636 class GoogleIE(InfoExtractor):
1637         """Information extractor for video.google.com."""
1638
1639         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1640         IE_NAME = u'video.google'
1641
1642         def __init__(self, downloader=None):
1643                 InfoExtractor.__init__(self, downloader)
1644
1645         def report_download_webpage(self, video_id):
1646                 """Report webpage download."""
1647                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1648
1649         def report_extraction(self, video_id):
1650                 """Report information extraction."""
1651                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1652
1653         def _real_extract(self, url):
1654                 # Extract id from URL
1655                 mobj = re.match(self._VALID_URL, url)
1656                 if mobj is None:
1657                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1658                         return
1659
1660                 # At this point we have a new video
1661                 self._downloader.increment_downloads()
1662                 video_id = mobj.group(1)
1663
1664                 video_extension = 'mp4'
1665
1666                 # Retrieve video webpage to extract further information
1667                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1668                 try:
1669                         self.report_download_webpage(video_id)
1670                         webpage = urllib2.urlopen(request).read()
1671                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1672                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1673                         return
1674
1675                 # Extract URL, uploader, and title from webpage
1676                 self.report_extraction(video_id)
1677                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1678                 if mobj is None:
1679                         video_extension = 'flv'
1680                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1681                 if mobj is None:
1682                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1683                         return
1684                 mediaURL = urllib.unquote(mobj.group(1))
1685                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1686                 mediaURL = mediaURL.replace('\\x26', '\x26')
1687
1688                 video_url = mediaURL
1689
1690                 mobj = re.search(r'<title>(.*)</title>', webpage)
1691                 if mobj is None:
1692                         self._downloader.trouble(u'ERROR: unable to extract title')
1693                         return
1694                 video_title = mobj.group(1).decode('utf-8')
1695                 video_title = sanitize_title(video_title)
1696                 simple_title = _simplify_title(video_title)
1697
1698                 # Extract video description
1699                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1700                 if mobj is None:
1701                         self._downloader.trouble(u'ERROR: unable to extract video description')
1702                         return
1703                 video_description = mobj.group(1).decode('utf-8')
1704                 if not video_description:
1705                         video_description = 'No description available.'
1706
1707                 # Extract video thumbnail
1708                 if self._downloader.params.get('forcethumbnail', False):
1709                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1710                         try:
1711                                 webpage = urllib2.urlopen(request).read()
1712                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1713                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1714                                 return
1715                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1716                         if mobj is None:
1717                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1718                                 return
1719                         video_thumbnail = mobj.group(1)
1720                 else:   # we need something to pass to process_info
1721                         video_thumbnail = ''
1722
1723                 try:
1724                         # Process video information
1725                         self._downloader.process_info({
1726                                 'id':           video_id.decode('utf-8'),
1727                                 'url':          video_url.decode('utf-8'),
1728                                 'uploader':     u'NA',
1729                                 'upload_date':  u'NA',
1730                                 'title':        video_title,
1731                                 'stitle':       simple_title,
1732                                 'ext':          video_extension.decode('utf-8'),
1733                                 'format':       u'NA',
1734                                 'player_url':   None,
1735                         })
1736                 except UnavailableVideoError:
1737                         self._downloader.trouble(u'\nERROR: unable to download video')
1738
1739
1740 class PhotobucketIE(InfoExtractor):
1741         """Information extractor for photobucket.com."""
1742
1743         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1744         IE_NAME = u'photobucket'
1745
1746         def __init__(self, downloader=None):
1747                 InfoExtractor.__init__(self, downloader)
1748
1749         def report_download_webpage(self, video_id):
1750                 """Report webpage download."""
1751                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1752
1753         def report_extraction(self, video_id):
1754                 """Report information extraction."""
1755                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1756
1757         def _real_extract(self, url):
1758                 # Extract id from URL
1759                 mobj = re.match(self._VALID_URL, url)
1760                 if mobj is None:
1761                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1762                         return
1763
1764                 # At this point we have a new video
1765                 self._downloader.increment_downloads()
1766                 video_id = mobj.group(1)
1767
1768                 video_extension = 'flv'
1769
1770                 # Retrieve video webpage to extract further information
1771                 request = urllib2.Request(url)
1772                 try:
1773                         self.report_download_webpage(video_id)
1774                         webpage = urllib2.urlopen(request).read()
1775                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1776                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1777                         return
1778
1779                 # Extract URL, uploader, and title from webpage
1780                 self.report_extraction(video_id)
1781                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1782                 if mobj is None:
1783                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1784                         return
1785                 mediaURL = urllib.unquote(mobj.group(1))
1786
1787                 video_url = mediaURL
1788
1789                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1790                 if mobj is None:
1791                         self._downloader.trouble(u'ERROR: unable to extract title')
1792                         return
1793                 video_title = mobj.group(1).decode('utf-8')
1794                 video_title = sanitize_title(video_title)
1795                 simple_title = _simplify_title(vide_title)
1796
1797                 video_uploader = mobj.group(2).decode('utf-8')
1798
1799                 try:
1800                         # Process video information
1801                         self._downloader.process_info({
1802                                 'id':           video_id.decode('utf-8'),
1803                                 'url':          video_url.decode('utf-8'),
1804                                 'uploader':     video_uploader,
1805                                 'upload_date':  u'NA',
1806                                 'title':        video_title,
1807                                 'stitle':       simple_title,
1808                                 'ext':          video_extension.decode('utf-8'),
1809                                 'format':       u'NA',
1810                                 'player_url':   None,
1811                         })
1812                 except UnavailableVideoError:
1813                         self._downloader.trouble(u'\nERROR: unable to download video')
1814
1815
1816 class YahooIE(InfoExtractor):
1817         """Information extractor for video.yahoo.com."""
1818
1819         # _VALID_URL matches all Yahoo! Video URLs
1820         # _VPAGE_URL matches only the extractable '/watch/' URLs
1821         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1822         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1823         IE_NAME = u'video.yahoo'
1824
1825         def __init__(self, downloader=None):
1826                 InfoExtractor.__init__(self, downloader)
1827
1828         def report_download_webpage(self, video_id):
1829                 """Report webpage download."""
1830                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1831
1832         def report_extraction(self, video_id):
1833                 """Report information extraction."""
1834                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1835
1836         def _real_extract(self, url, new_video=True):
1837                 # Extract ID from URL
1838                 mobj = re.match(self._VALID_URL, url)
1839                 if mobj is None:
1840                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1841                         return
1842
1843                 # At this point we have a new video
1844                 self._downloader.increment_downloads()
1845                 video_id = mobj.group(2)
1846                 video_extension = 'flv'
1847
1848                 # Rewrite valid but non-extractable URLs as
1849                 # extractable English language /watch/ URLs
1850                 if re.match(self._VPAGE_URL, url) is None:
1851                         request = urllib2.Request(url)
1852                         try:
1853                                 webpage = urllib2.urlopen(request).read()
1854                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1855                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1856                                 return
1857
1858                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1859                         if mobj is None:
1860                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1861                                 return
1862                         yahoo_id = mobj.group(1)
1863
1864                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1865                         if mobj is None:
1866                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1867                                 return
1868                         yahoo_vid = mobj.group(1)
1869
1870                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1871                         return self._real_extract(url, new_video=False)
1872
1873                 # Retrieve video webpage to extract further information
1874                 request = urllib2.Request(url)
1875                 try:
1876                         self.report_download_webpage(video_id)
1877                         webpage = urllib2.urlopen(request).read()
1878                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1879                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1880                         return
1881
1882                 # Extract uploader and title from webpage
1883                 self.report_extraction(video_id)
1884                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1885                 if mobj is None:
1886                         self._downloader.trouble(u'ERROR: unable to extract video title')
1887                         return
1888                 video_title = mobj.group(1).decode('utf-8')
1889                 simple_title = _simplify_title(video_title)
1890
1891                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1892                 if mobj is None:
1893                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1894                         return
1895                 video_uploader = mobj.group(1).decode('utf-8')
1896
1897                 # Extract video thumbnail
1898                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1899                 if mobj is None:
1900                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1901                         return
1902                 video_thumbnail = mobj.group(1).decode('utf-8')
1903
1904                 # Extract video description
1905                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1906                 if mobj is None:
1907                         self._downloader.trouble(u'ERROR: unable to extract video description')
1908                         return
1909                 video_description = mobj.group(1).decode('utf-8')
1910                 if not video_description:
1911                         video_description = 'No description available.'
1912
1913                 # Extract video height and width
1914                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1915                 if mobj is None:
1916                         self._downloader.trouble(u'ERROR: unable to extract video height')
1917                         return
1918                 yv_video_height = mobj.group(1)
1919
1920                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1921                 if mobj is None:
1922                         self._downloader.trouble(u'ERROR: unable to extract video width')
1923                         return
1924                 yv_video_width = mobj.group(1)
1925
1926                 # Retrieve video playlist to extract media URL
1927                 # I'm not completely sure what all these options are, but we
1928                 # seem to need most of them, otherwise the server sends a 401.
1929                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1930                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1931                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1932                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1933                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1934                 try:
1935                         self.report_download_webpage(video_id)
1936                         webpage = urllib2.urlopen(request).read()
1937                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1938                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1939                         return
1940
1941                 # Extract media URL from playlist XML
1942                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1943                 if mobj is None:
1944                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1945                         return
1946                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1947                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1948
1949                 try:
1950                         # Process video information
1951                         self._downloader.process_info({
1952                                 'id':           video_id.decode('utf-8'),
1953                                 'url':          video_url,
1954                                 'uploader':     video_uploader,
1955                                 'upload_date':  u'NA',
1956                                 'title':        video_title,
1957                                 'stitle':       simple_title,
1958                                 'ext':          video_extension.decode('utf-8'),
1959                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1960                                 'description':  video_description,
1961                                 'thumbnail':    video_thumbnail,
1962                                 'player_url':   None,
1963                         })
1964                 except UnavailableVideoError:
1965                         self._downloader.trouble(u'\nERROR: unable to download video')
1966
1967
1968 class VimeoIE(InfoExtractor):
1969         """Information extractor for vimeo.com."""
1970
1971         # _VALID_URL matches Vimeo URLs
1972         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1973         IE_NAME = u'vimeo'
1974
1975         def __init__(self, downloader=None):
1976                 InfoExtractor.__init__(self, downloader)
1977
1978         def report_download_webpage(self, video_id):
1979                 """Report webpage download."""
1980                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1981
1982         def report_extraction(self, video_id):
1983                 """Report information extraction."""
1984                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1985
1986         def _real_extract(self, url, new_video=True):
1987                 # Extract ID from URL
1988                 mobj = re.match(self._VALID_URL, url)
1989                 if mobj is None:
1990                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1991                         return
1992
1993                 # At this point we have a new video
1994                 self._downloader.increment_downloads()
1995                 video_id = mobj.group(1)
1996
1997                 # Retrieve video webpage to extract further information
1998                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1999                 try:
2000                         self.report_download_webpage(video_id)
2001                         webpage = urllib2.urlopen(request).read()
2002                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2003                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2004                         return
2005
2006                 # Now we begin extracting as much information as we can from what we
2007                 # retrieved. First we extract the information common to all extractors,
2008                 # and latter we extract those that are Vimeo specific.
2009                 self.report_extraction(video_id)
2010
2011                 # Extract title
2012                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2013                 if mobj is None:
2014                         self._downloader.trouble(u'ERROR: unable to extract video title')
2015                         return
2016                 video_title = mobj.group(1).decode('utf-8')
2017                 simple_title = _simple_title(video_title)
2018
2019                 # Extract uploader
2020                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2021                 if mobj is None:
2022                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2023                         return
2024                 video_uploader = mobj.group(1).decode('utf-8')
2025
2026                 # Extract video thumbnail
2027                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2028                 if mobj is None:
2029                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2030                         return
2031                 video_thumbnail = mobj.group(1).decode('utf-8')
2032
2033                 # # Extract video description
2034                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2035                 # if mobj is None:
2036                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2037                 #       return
2038                 # video_description = mobj.group(1).decode('utf-8')
2039                 # if not video_description: video_description = 'No description available.'
2040                 video_description = 'Foo.'
2041
2042                 # Vimeo specific: extract request signature
2043                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2044                 if mobj is None:
2045                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2046                         return
2047                 sig = mobj.group(1).decode('utf-8')
2048
2049                 # Vimeo specific: extract video quality information
2050                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2051                 if mobj is None:
2052                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2053                         return
2054                 quality = mobj.group(1).decode('utf-8')
2055
2056                 if int(quality) == 1:
2057                         quality = 'hd'
2058                 else:
2059                         quality = 'sd'
2060
2061                 # Vimeo specific: Extract request signature expiration
2062                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2063                 if mobj is None:
2064                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2065                         return
2066                 sig_exp = mobj.group(1).decode('utf-8')
2067
2068                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2069
2070                 try:
2071                         # Process video information
2072                         self._downloader.process_info({
2073                                 'id':           video_id.decode('utf-8'),
2074                                 'url':          video_url,
2075                                 'uploader':     video_uploader,
2076                                 'upload_date':  u'NA',
2077                                 'title':        video_title,
2078                                 'stitle':       simple_title,
2079                                 'ext':          u'mp4',
2080                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2081                                 'description':  video_description,
2082                                 'thumbnail':    video_thumbnail,
2083                                 'description':  video_description,
2084                                 'player_url':   None,
2085                         })
2086                 except UnavailableVideoError:
2087                         self._downloader.trouble(u'ERROR: unable to download video')
2088
2089
2090 class GenericIE(InfoExtractor):
2091         """Generic last-resort information extractor."""
2092
2093         _VALID_URL = r'.*'
2094         IE_NAME = u'generic'
2095
2096         def __init__(self, downloader=None):
2097                 InfoExtractor.__init__(self, downloader)
2098
2099         def report_download_webpage(self, video_id):
2100                 """Report webpage download."""
2101                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2102                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2103
2104         def report_extraction(self, video_id):
2105                 """Report information extraction."""
2106                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2107
2108         def _real_extract(self, url):
2109                 # At this point we have a new video
2110                 self._downloader.increment_downloads()
2111
2112                 video_id = url.split('/')[-1]
2113                 request = urllib2.Request(url)
2114                 try:
2115                         self.report_download_webpage(video_id)
2116                         webpage = urllib2.urlopen(request).read()
2117                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2118                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2119                         return
2120                 except ValueError, err:
2121                         # since this is the last-resort InfoExtractor, if
2122                         # this error is thrown, it'll be thrown here
2123                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2124                         return
2125
2126                 self.report_extraction(video_id)
2127                 # Start with something easy: JW Player in SWFObject
2128                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2129                 if mobj is None:
2130                         # Broaden the search a little bit
2131                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2132                 if mobj is None:
2133                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2134                         return
2135
2136                 # It's possible that one of the regexes
2137                 # matched, but returned an empty group:
2138                 if mobj.group(1) is None:
2139                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2140                         return
2141
2142                 video_url = urllib.unquote(mobj.group(1))
2143                 video_id = os.path.basename(video_url)
2144
2145                 # here's a fun little line of code for you:
2146                 video_extension = os.path.splitext(video_id)[1][1:]
2147                 video_id = os.path.splitext(video_id)[0]
2148
2149                 # it's tempting to parse this further, but you would
2150                 # have to take into account all the variations like
2151                 #   Video Title - Site Name
2152                 #   Site Name | Video Title
2153                 #   Video Title - Tagline | Site Name
2154                 # and so on and so forth; it's just not practical
2155                 mobj = re.search(r'<title>(.*)</title>', webpage)
2156                 if mobj is None:
2157                         self._downloader.trouble(u'ERROR: unable to extract title')
2158                         return
2159                 video_title = mobj.group(1).decode('utf-8')
2160                 video_title = sanitize_title(video_title)
2161                 simple_title = _simplify_title(video_title)
2162
2163                 # video uploader is domain name
2164                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2165                 if mobj is None:
2166                         self._downloader.trouble(u'ERROR: unable to extract title')
2167                         return
2168                 video_uploader = mobj.group(1).decode('utf-8')
2169
2170                 try:
2171                         # Process video information
2172                         self._downloader.process_info({
2173                                 'id':           video_id.decode('utf-8'),
2174                                 'url':          video_url.decode('utf-8'),
2175                                 'uploader':     video_uploader,
2176                                 'upload_date':  u'NA',
2177                                 'title':        video_title,
2178                                 'stitle':       simple_title,
2179                                 'ext':          video_extension.decode('utf-8'),
2180                                 'format':       u'NA',
2181                                 'player_url':   None,
2182                         })
2183                 except UnavailableVideoError, err:
2184                         self._downloader.trouble(u'\nERROR: unable to download video')
2185
2186
2187 class YoutubeSearchIE(InfoExtractor):
2188         """Information Extractor for YouTube search queries."""
2189         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2190         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2191         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2192         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2193         _youtube_ie = None
2194         _max_youtube_results = 1000
2195         IE_NAME = u'youtube:search'
2196
2197         def __init__(self, youtube_ie, downloader=None):
2198                 InfoExtractor.__init__(self, downloader)
2199                 self._youtube_ie = youtube_ie
2200
2201         def report_download_page(self, query, pagenum):
2202                 """Report attempt to download playlist page with given number."""
2203                 query = query.decode(preferredencoding())
2204                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2205
2206         def _real_initialize(self):
2207                 self._youtube_ie.initialize()
2208
2209         def _real_extract(self, query):
2210                 mobj = re.match(self._VALID_URL, query)
2211                 if mobj is None:
2212                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2213                         return
2214
2215                 prefix, query = query.split(':')
2216                 prefix = prefix[8:]
2217                 query = query.encode('utf-8')
2218                 if prefix == '':
2219                         self._download_n_results(query, 1)
2220                         return
2221                 elif prefix == 'all':
2222                         self._download_n_results(query, self._max_youtube_results)
2223                         return
2224                 else:
2225                         try:
2226                                 n = long(prefix)
2227                                 if n <= 0:
2228                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2229                                         return
2230                                 elif n > self._max_youtube_results:
2231                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2232                                         n = self._max_youtube_results
2233                                 self._download_n_results(query, n)
2234                                 return
2235                         except ValueError: # parsing prefix as integer fails
2236                                 self._download_n_results(query, 1)
2237                                 return
2238
2239         def _download_n_results(self, query, n):
2240                 """Downloads a specified number of results for a query"""
2241
2242                 video_ids = []
2243                 already_seen = set()
2244                 pagenum = 1
2245
2246                 while True:
2247                         self.report_download_page(query, pagenum)
2248                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2249                         request = urllib2.Request(result_url)
2250                         try:
2251                                 page = urllib2.urlopen(request).read()
2252                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2253                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2254                                 return
2255
2256                         # Extract video identifiers
2257                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2258                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2259                                 if video_id not in already_seen:
2260                                         video_ids.append(video_id)
2261                                         already_seen.add(video_id)
2262                                         if len(video_ids) == n:
2263                                                 # Specified n videos reached
2264                                                 for id in video_ids:
2265                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2266                                                 return
2267
2268                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2269                                 for id in video_ids:
2270                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2271                                 return
2272
2273                         pagenum = pagenum + 1
2274
2275
2276 class GoogleSearchIE(InfoExtractor):
2277         """Information Extractor for Google Video search queries."""
2278         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2279         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2280         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2281         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2282         _google_ie = None
2283         _max_google_results = 1000
2284         IE_NAME = u'video.google:search'
2285
2286         def __init__(self, google_ie, downloader=None):
2287                 InfoExtractor.__init__(self, downloader)
2288                 self._google_ie = google_ie
2289
2290         def report_download_page(self, query, pagenum):
2291                 """Report attempt to download playlist page with given number."""
2292                 query = query.decode(preferredencoding())
2293                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2294
2295         def _real_initialize(self):
2296                 self._google_ie.initialize()
2297
2298         def _real_extract(self, query):
2299                 mobj = re.match(self._VALID_URL, query)
2300                 if mobj is None:
2301                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2302                         return
2303
2304                 prefix, query = query.split(':')
2305                 prefix = prefix[8:]
2306                 query = query.encode('utf-8')
2307                 if prefix == '':
2308                         self._download_n_results(query, 1)
2309                         return
2310                 elif prefix == 'all':
2311                         self._download_n_results(query, self._max_google_results)
2312                         return
2313                 else:
2314                         try:
2315                                 n = long(prefix)
2316                                 if n <= 0:
2317                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2318                                         return
2319                                 elif n > self._max_google_results:
2320                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2321                                         n = self._max_google_results
2322                                 self._download_n_results(query, n)
2323                                 return
2324                         except ValueError: # parsing prefix as integer fails
2325                                 self._download_n_results(query, 1)
2326                                 return
2327
2328         def _download_n_results(self, query, n):
2329                 """Downloads a specified number of results for a query"""
2330
2331                 video_ids = []
2332                 already_seen = set()
2333                 pagenum = 1
2334
2335                 while True:
2336                         self.report_download_page(query, pagenum)
2337                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2338                         request = urllib2.Request(result_url)
2339                         try:
2340                                 page = urllib2.urlopen(request).read()
2341                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2342                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2343                                 return
2344
2345                         # Extract video identifiers
2346                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2347                                 video_id = mobj.group(1)
2348                                 if video_id not in already_seen:
2349                                         video_ids.append(video_id)
2350                                         already_seen.add(video_id)
2351                                         if len(video_ids) == n:
2352                                                 # Specified n videos reached
2353                                                 for id in video_ids:
2354                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2355                                                 return
2356
2357                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2358                                 for id in video_ids:
2359                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2360                                 return
2361
2362                         pagenum = pagenum + 1
2363
2364
2365 class YahooSearchIE(InfoExtractor):
2366         """Information Extractor for Yahoo! Video search queries."""
2367         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2368         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2369         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2370         _MORE_PAGES_INDICATOR = r'\s*Next'
2371         _yahoo_ie = None
2372         _max_yahoo_results = 1000
2373         IE_NAME = u'video.yahoo:search'
2374
2375         def __init__(self, yahoo_ie, downloader=None):
2376                 InfoExtractor.__init__(self, downloader)
2377                 self._yahoo_ie = yahoo_ie
2378
2379         def report_download_page(self, query, pagenum):
2380                 """Report attempt to download playlist page with given number."""
2381                 query = query.decode(preferredencoding())
2382                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2383
2384         def _real_initialize(self):
2385                 self._yahoo_ie.initialize()
2386
2387         def _real_extract(self, query):
2388                 mobj = re.match(self._VALID_URL, query)
2389                 if mobj is None:
2390                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2391                         return
2392
2393                 prefix, query = query.split(':')
2394                 prefix = prefix[8:]
2395                 query = query.encode('utf-8')
2396                 if prefix == '':
2397                         self._download_n_results(query, 1)
2398                         return
2399                 elif prefix == 'all':
2400                         self._download_n_results(query, self._max_yahoo_results)
2401                         return
2402                 else:
2403                         try:
2404                                 n = long(prefix)
2405                                 if n <= 0:
2406                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2407                                         return
2408                                 elif n > self._max_yahoo_results:
2409                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2410                                         n = self._max_yahoo_results
2411                                 self._download_n_results(query, n)
2412                                 return
2413                         except ValueError: # parsing prefix as integer fails
2414                                 self._download_n_results(query, 1)
2415                                 return
2416
2417         def _download_n_results(self, query, n):
2418                 """Downloads a specified number of results for a query"""
2419
2420                 video_ids = []
2421                 already_seen = set()
2422                 pagenum = 1
2423
2424                 while True:
2425                         self.report_download_page(query, pagenum)
2426                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2427                         request = urllib2.Request(result_url)
2428                         try:
2429                                 page = urllib2.urlopen(request).read()
2430                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2431                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2432                                 return
2433
2434                         # Extract video identifiers
2435                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2436                                 video_id = mobj.group(1)
2437                                 if video_id not in already_seen:
2438                                         video_ids.append(video_id)
2439                                         already_seen.add(video_id)
2440                                         if len(video_ids) == n:
2441                                                 # Specified n videos reached
2442                                                 for id in video_ids:
2443                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2444                                                 return
2445
2446                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2447                                 for id in video_ids:
2448                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2449                                 return
2450
2451                         pagenum = pagenum + 1
2452
2453
2454 class YoutubePlaylistIE(InfoExtractor):
2455         """Information Extractor for YouTube playlists."""
2456
2457         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2458         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2459         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2460         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2461         _youtube_ie = None
2462         IE_NAME = u'youtube:playlist'
2463
2464         def __init__(self, youtube_ie, downloader=None):
2465                 InfoExtractor.__init__(self, downloader)
2466                 self._youtube_ie = youtube_ie
2467
2468         def report_download_page(self, playlist_id, pagenum):
2469                 """Report attempt to download playlist page with given number."""
2470                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2471
2472         def _real_initialize(self):
2473                 self._youtube_ie.initialize()
2474
2475         def _real_extract(self, url):
2476                 # Extract playlist id
2477                 mobj = re.match(self._VALID_URL, url)
2478                 if mobj is None:
2479                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2480                         return
2481
2482                 # Single video case
2483                 if mobj.group(3) is not None:
2484                         self._youtube_ie.extract(mobj.group(3))
2485                         return
2486
2487                 # Download playlist pages
2488                 # prefix is 'p' as default for playlists but there are other types that need extra care
2489                 playlist_prefix = mobj.group(1)
2490                 if playlist_prefix == 'a':
2491                         playlist_access = 'artist'
2492                 else:
2493                         playlist_prefix = 'p'
2494                         playlist_access = 'view_play_list'
2495                 playlist_id = mobj.group(2)
2496                 video_ids = []
2497                 pagenum = 1
2498
2499                 while True:
2500                         self.report_download_page(playlist_id, pagenum)
2501                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2502                         request = urllib2.Request(url)
2503                         try:
2504                                 page = urllib2.urlopen(request).read()
2505                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2506                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2507                                 return
2508
2509                         # Extract video identifiers
2510                         ids_in_page = []
2511                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2512                                 if mobj.group(1) not in ids_in_page:
2513                                         ids_in_page.append(mobj.group(1))
2514                         video_ids.extend(ids_in_page)
2515
2516                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2517                                 break
2518                         pagenum = pagenum + 1
2519
2520                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2521                 playlistend = self._downloader.params.get('playlistend', -1)
2522                 video_ids = video_ids[playliststart:playlistend]
2523
2524                 for id in video_ids:
2525                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2526                 return
2527
2528
2529 class YoutubeUserIE(InfoExtractor):
2530         """Information Extractor for YouTube users."""
2531
2532         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2533         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2534         _GDATA_PAGE_SIZE = 50
2535         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2536         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2537         _youtube_ie = None
2538         IE_NAME = u'youtube:user'
2539
2540         def __init__(self, youtube_ie, downloader=None):
2541                 InfoExtractor.__init__(self, downloader)
2542                 self._youtube_ie = youtube_ie
2543
2544         def report_download_page(self, username, start_index):
2545                 """Report attempt to download user page."""
2546                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2547                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2548
2549         def _real_initialize(self):
2550                 self._youtube_ie.initialize()
2551
2552         def _real_extract(self, url):
2553                 # Extract username
2554                 mobj = re.match(self._VALID_URL, url)
2555                 if mobj is None:
2556                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2557                         return
2558
2559                 username = mobj.group(1)
2560
2561                 # Download video ids using YouTube Data API. Result size per
2562                 # query is limited (currently to 50 videos) so we need to query
2563                 # page by page until there are no video ids - it means we got
2564                 # all of them.
2565
2566                 video_ids = []
2567                 pagenum = 0
2568
2569                 while True:
2570                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2571                         self.report_download_page(username, start_index)
2572
2573                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2574
2575                         try:
2576                                 page = urllib2.urlopen(request).read()
2577                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2578                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2579                                 return
2580
2581                         # Extract video identifiers
2582                         ids_in_page = []
2583
2584                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2585                                 if mobj.group(1) not in ids_in_page:
2586                                         ids_in_page.append(mobj.group(1))
2587
2588                         video_ids.extend(ids_in_page)
2589
2590                         # A little optimization - if current page is not
2591                         # "full", ie. does not contain PAGE_SIZE video ids then
2592                         # we can assume that this page is the last one - there
2593                         # are no more ids on further pages - no need to query
2594                         # again.
2595
2596                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2597                                 break
2598
2599                         pagenum += 1
2600
2601                 all_ids_count = len(video_ids)
2602                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2603                 playlistend = self._downloader.params.get('playlistend', -1)
2604
2605                 if playlistend == -1:
2606                         video_ids = video_ids[playliststart:]
2607                 else:
2608                         video_ids = video_ids[playliststart:playlistend]
2609
2610                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2611                                 (username, all_ids_count, len(video_ids)))
2612
2613                 for video_id in video_ids:
2614                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2615
2616
2617 class DepositFilesIE(InfoExtractor):
2618         """Information extractor for depositfiles.com"""
2619
2620         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2621         IE_NAME = u'DepositFiles'
2622
2623         def __init__(self, downloader=None):
2624                 InfoExtractor.__init__(self, downloader)
2625
2626         def report_download_webpage(self, file_id):
2627                 """Report webpage download."""
2628                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2629
2630         def report_extraction(self, file_id):
2631                 """Report information extraction."""
2632                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2633
2634         def _real_extract(self, url):
2635                 # At this point we have a new file
2636                 self._downloader.increment_downloads()
2637
2638                 file_id = url.split('/')[-1]
2639                 # Rebuild url in english locale
2640                 url = 'http://depositfiles.com/en/files/' + file_id
2641
2642                 # Retrieve file webpage with 'Free download' button pressed
2643                 free_download_indication = { 'gateway_result' : '1' }
2644                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2645                 try:
2646                         self.report_download_webpage(file_id)
2647                         webpage = urllib2.urlopen(request).read()
2648                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2649                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2650                         return
2651
2652                 # Search for the real file URL
2653                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2654                 if (mobj is None) or (mobj.group(1) is None):
2655                         # Try to figure out reason of the error.
2656                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2657                         if (mobj is not None) and (mobj.group(1) is not None):
2658                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2659                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2660                         else:
2661                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2662                         return
2663
2664                 file_url = mobj.group(1)
2665                 file_extension = os.path.splitext(file_url)[1][1:]
2666
2667                 # Search for file title
2668                 mobj = re.search(r'<b title="(.*?)">', webpage)
2669                 if mobj is None:
2670                         self._downloader.trouble(u'ERROR: unable to extract title')
2671                         return
2672                 file_title = mobj.group(1).decode('utf-8')
2673
2674                 try:
2675                         # Process file information
2676                         self._downloader.process_info({
2677                                 'id':           file_id.decode('utf-8'),
2678                                 'url':          file_url.decode('utf-8'),
2679                                 'uploader':     u'NA',
2680                                 'upload_date':  u'NA',
2681                                 'title':        file_title,
2682                                 'stitle':       file_title,
2683                                 'ext':          file_extension.decode('utf-8'),
2684                                 'format':       u'NA',
2685                                 'player_url':   None,
2686                         })
2687                 except UnavailableVideoError, err:
2688                         self._downloader.trouble(u'ERROR: unable to download file')
2689
2690
2691 class FacebookIE(InfoExtractor):
2692         """Information Extractor for Facebook"""
2693
2694         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2695         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2696         _NETRC_MACHINE = 'facebook'
2697         _available_formats = ['video', 'highqual', 'lowqual']
2698         _video_extensions = {
2699                 'video': 'mp4',
2700                 'highqual': 'mp4',
2701                 'lowqual': 'mp4',
2702         }
2703         IE_NAME = u'facebook'
2704
2705         def __init__(self, downloader=None):
2706                 InfoExtractor.__init__(self, downloader)
2707
2708         def _reporter(self, message):
2709                 """Add header and report message."""
2710                 self._downloader.to_screen(u'[facebook] %s' % message)
2711
2712         def report_login(self):
2713                 """Report attempt to log in."""
2714                 self._reporter(u'Logging in')
2715
2716         def report_video_webpage_download(self, video_id):
2717                 """Report attempt to download video webpage."""
2718                 self._reporter(u'%s: Downloading video webpage' % video_id)
2719
2720         def report_information_extraction(self, video_id):
2721                 """Report attempt to extract video information."""
2722                 self._reporter(u'%s: Extracting video information' % video_id)
2723
2724         def _parse_page(self, video_webpage):
2725                 """Extract video information from page"""
2726                 # General data
2727                 data = {'title': r'\("video_title", "(.*?)"\)',
2728                         'description': r'<div class="datawrap">(.*?)</div>',
2729                         'owner': r'\("video_owner_name", "(.*?)"\)',
2730                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2731                         }
2732                 video_info = {}
2733                 for piece in data.keys():
2734                         mobj = re.search(data[piece], video_webpage)
2735                         if mobj is not None:
2736                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2737
2738                 # Video urls
2739                 video_urls = {}
2740                 for fmt in self._available_formats:
2741                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2742                         if mobj is not None:
2743                                 # URL is in a Javascript segment inside an escaped Unicode format within
2744                                 # the generally utf-8 page
2745                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2746                 video_info['video_urls'] = video_urls
2747
2748                 return video_info
2749
2750         def _real_initialize(self):
2751                 if self._downloader is None:
2752                         return
2753
2754                 useremail = None
2755                 password = None
2756                 downloader_params = self._downloader.params
2757
2758                 # Attempt to use provided username and password or .netrc data
2759                 if downloader_params.get('username', None) is not None:
2760                         useremail = downloader_params['username']
2761                         password = downloader_params['password']
2762                 elif downloader_params.get('usenetrc', False):
2763                         try:
2764                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2765                                 if info is not None:
2766                                         useremail = info[0]
2767                                         password = info[2]
2768                                 else:
2769                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2770                         except (IOError, netrc.NetrcParseError), err:
2771                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2772                                 return
2773
2774                 if useremail is None:
2775                         return
2776
2777                 # Log in
2778                 login_form = {
2779                         'email': useremail,
2780                         'pass': password,
2781                         'login': 'Log+In'
2782                         }
2783                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2784                 try:
2785                         self.report_login()
2786                         login_results = urllib2.urlopen(request).read()
2787                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2788                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2789                                 return
2790                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2791                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2792                         return
2793
2794         def _real_extract(self, url):
2795                 mobj = re.match(self._VALID_URL, url)
2796                 if mobj is None:
2797                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2798                         return
2799                 video_id = mobj.group('ID')
2800
2801                 # Get video webpage
2802                 self.report_video_webpage_download(video_id)
2803                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2804                 try:
2805                         page = urllib2.urlopen(request)
2806                         video_webpage = page.read()
2807                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2808                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2809                         return
2810
2811                 # Start extracting information
2812                 self.report_information_extraction(video_id)
2813
2814                 # Extract information
2815                 video_info = self._parse_page(video_webpage)
2816
2817                 # uploader
2818                 if 'owner' not in video_info:
2819                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2820                         return
2821                 video_uploader = video_info['owner']
2822
2823                 # title
2824                 if 'title' not in video_info:
2825                         self._downloader.trouble(u'ERROR: unable to extract video title')
2826                         return
2827                 video_title = video_info['title']
2828                 video_title = video_title.decode('utf-8')
2829                 video_title = sanitize_title(video_title)
2830
2831                 simple_title = _simplify_title(video_title)
2832
2833                 # thumbnail image
2834                 if 'thumbnail' not in video_info:
2835                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2836                         video_thumbnail = ''
2837                 else:
2838                         video_thumbnail = video_info['thumbnail']
2839
2840                 # upload date
2841                 upload_date = u'NA'
2842                 if 'upload_date' in video_info:
2843                         upload_time = video_info['upload_date']
2844                         timetuple = email.utils.parsedate_tz(upload_time)
2845                         if timetuple is not None:
2846                                 try:
2847                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2848                                 except:
2849                                         pass
2850
2851                 # description
2852                 video_description = video_info.get('description', 'No description available.')
2853
2854                 url_map = video_info['video_urls']
2855                 if len(url_map.keys()) > 0:
2856                         # Decide which formats to download
2857                         req_format = self._downloader.params.get('format', None)
2858                         format_limit = self._downloader.params.get('format_limit', None)
2859
2860                         if format_limit is not None and format_limit in self._available_formats:
2861                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2862                         else:
2863                                 format_list = self._available_formats
2864                         existing_formats = [x for x in format_list if x in url_map]
2865                         if len(existing_formats) == 0:
2866                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2867                                 return
2868                         if req_format is None:
2869                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2870                         elif req_format == 'worst':
2871                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2872                         elif req_format == '-1':
2873                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2874                         else:
2875                                 # Specific format
2876                                 if req_format not in url_map:
2877                                         self._downloader.trouble(u'ERROR: requested format not available')
2878                                         return
2879                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2880
2881                 for format_param, video_real_url in video_url_list:
2882
2883                         # At this point we have a new video
2884                         self._downloader.increment_downloads()
2885
2886                         # Extension
2887                         video_extension = self._video_extensions.get(format_param, 'mp4')
2888
2889                         try:
2890                                 # Process video information
2891                                 self._downloader.process_info({
2892                                         'id':           video_id.decode('utf-8'),
2893                                         'url':          video_real_url.decode('utf-8'),
2894                                         'uploader':     video_uploader.decode('utf-8'),
2895                                         'upload_date':  upload_date,
2896                                         'title':        video_title,
2897                                         'stitle':       simple_title,
2898                                         'ext':          video_extension.decode('utf-8'),
2899                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2900                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2901                                         'description':  video_description.decode('utf-8'),
2902                                         'player_url':   None,
2903                                 })
2904                         except UnavailableVideoError, err:
2905                                 self._downloader.trouble(u'\nERROR: unable to download video')
2906
2907 class BlipTVIE(InfoExtractor):
2908         """Information extractor for blip.tv"""
2909
2910         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2911         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2912         IE_NAME = u'blip.tv'
2913
2914         def report_extraction(self, file_id):
2915                 """Report information extraction."""
2916                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2917
2918         def report_direct_download(self, title):
2919                 """Report information extraction."""
2920                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2921
2922         def _real_extract(self, url):
2923                 mobj = re.match(self._VALID_URL, url)
2924                 if mobj is None:
2925                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2926                         return
2927
2928                 if '?' in url:
2929                         cchar = '&'
2930                 else:
2931                         cchar = '?'
2932                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2933                 request = urllib2.Request(json_url)
2934                 self.report_extraction(mobj.group(1))
2935                 info = None
2936                 try:
2937                         urlh = urllib2.urlopen(request)
2938                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2939                                 basename = url.split('/')[-1]
2940                                 title,ext = os.path.splitext(basename)
2941                                 title = title.decode('UTF-8')
2942                                 ext = ext.replace('.', '')
2943                                 self.report_direct_download(title)
2944                                 info = {
2945                                         'id': title,
2946                                         'url': url,
2947                                         'title': title,
2948                                         'stitle': _simplify_title(title),
2949                                         'ext': ext,
2950                                         'urlhandle': urlh
2951                                 }
2952                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2953                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2954                         return
2955                 if info is None: # Regular URL
2956                         try:
2957                                 json_code = urlh.read()
2958                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2959                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2960                                 return
2961
2962                         try:
2963                                 json_data = json.loads(json_code)
2964                                 if 'Post' in json_data:
2965                                         data = json_data['Post']
2966                                 else:
2967                                         data = json_data
2968
2969                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2970                                 video_url = data['media']['url']
2971                                 umobj = re.match(self._URL_EXT, video_url)
2972                                 if umobj is None:
2973                                         raise ValueError('Can not determine filename extension')
2974                                 ext = umobj.group(1)
2975
2976                                 info = {
2977                                         'id': data['item_id'],
2978                                         'url': video_url,
2979                                         'uploader': data['display_name'],
2980                                         'upload_date': upload_date,
2981                                         'title': data['title'],
2982                                         'stitle': _simplify_title(data['title']),
2983                                         'ext': ext,
2984                                         'format': data['media']['mimeType'],
2985                                         'thumbnail': data['thumbnailUrl'],
2986                                         'description': data['description'],
2987                                         'player_url': data['embedUrl']
2988                                 }
2989                         except (ValueError,KeyError), err:
2990                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2991                                 return
2992
2993                 self._downloader.increment_downloads()
2994
2995                 try:
2996                         self._downloader.process_info(info)
2997                 except UnavailableVideoError, err:
2998                         self._downloader.trouble(u'\nERROR: unable to download video')
2999
3000
3001 class MyVideoIE(InfoExtractor):
3002         """Information Extractor for myvideo.de."""
3003
3004         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3005         IE_NAME = u'myvideo'
3006
3007         def __init__(self, downloader=None):
3008                 InfoExtractor.__init__(self, downloader)
3009
3010         def report_download_webpage(self, video_id):
3011                 """Report webpage download."""
3012                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3013
3014         def report_extraction(self, video_id):
3015                 """Report information extraction."""
3016                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3017
3018         def _real_extract(self,url):
3019                 mobj = re.match(self._VALID_URL, url)
3020                 if mobj is None:
3021                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3022                         return
3023
3024                 video_id = mobj.group(1)
3025
3026                 # Get video webpage
3027                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3028                 try:
3029                         self.report_download_webpage(video_id)
3030                         webpage = urllib2.urlopen(request).read()
3031                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3032                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3033                         return
3034
3035                 self.report_extraction(video_id)
3036                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3037                                  webpage)
3038                 if mobj is None:
3039                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3040                         return
3041                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3042
3043                 mobj = re.search('<title>([^<]+)</title>', webpage)
3044                 if mobj is None:
3045                         self._downloader.trouble(u'ERROR: unable to extract title')
3046                         return
3047
3048                 video_title = mobj.group(1)
3049                 video_title = sanitize_title(video_title)
3050
3051                 simple_title = _simplify_title(video_title)
3052
3053                 try:
3054                         self._downloader.process_info({
3055                                 'id':           video_id,
3056                                 'url':          video_url,
3057                                 'uploader':     u'NA',
3058                                 'upload_date':  u'NA',
3059                                 'title':        video_title,
3060                                 'stitle':       simple_title,
3061                                 'ext':          u'flv',
3062                                 'format':       u'NA',
3063                                 'player_url':   None,
3064                         })
3065                 except UnavailableVideoError:
3066                         self._downloader.trouble(u'\nERROR: Unable to download video')
3067
3068 class ComedyCentralIE(InfoExtractor):
3069         """Information extractor for The Daily Show and Colbert Report """
3070
3071         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3072         IE_NAME = u'comedycentral'
3073
3074         def report_extraction(self, episode_id):
3075                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3076
3077         def report_config_download(self, episode_id):
3078                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3079
3080         def report_index_download(self, episode_id):
3081                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3082
3083         def report_player_url(self, episode_id):
3084                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3085
3086         def _real_extract(self, url):
3087                 mobj = re.match(self._VALID_URL, url)
3088                 if mobj is None:
3089                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3090                         return
3091
3092                 if mobj.group('shortname'):
3093                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
3094                                 url = u'http://www.thedailyshow.com/full-episodes/'
3095                         else:
3096                                 url = u'http://www.colbertnation.com/full-episodes/'
3097                         mobj = re.match(self._VALID_URL, url)
3098                         assert mobj is not None
3099
3100                 dlNewest = not mobj.group('episode')
3101                 if dlNewest:
3102                         epTitle = mobj.group('showname')
3103                 else:
3104                         epTitle = mobj.group('episode')
3105
3106                 req = urllib2.Request(url)
3107                 self.report_extraction(epTitle)
3108                 try:
3109                         htmlHandle = urllib2.urlopen(req)
3110                         html = htmlHandle.read()
3111                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3112                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3113                         return
3114                 if dlNewest:
3115                         url = htmlHandle.geturl()
3116                         mobj = re.match(self._VALID_URL, url)
3117                         if mobj is None:
3118                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3119                                 return
3120                         if mobj.group('episode') == '':
3121                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3122                                 return
3123                         epTitle = mobj.group('episode')
3124
3125                 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3126                 if len(mMovieParams) == 0:
3127                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3128                         return
3129
3130                 playerUrl_raw = mMovieParams[0][0]
3131                 self.report_player_url(epTitle)
3132                 try:
3133                         urlHandle = urllib2.urlopen(playerUrl_raw)
3134                         playerUrl = urlHandle.geturl()
3135                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3136                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3137                         return
3138
3139                 uri = mMovieParams[0][1]
3140                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3141                 self.report_index_download(epTitle)
3142                 try:
3143                         indexXml = urllib2.urlopen(indexUrl).read()
3144                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3145                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3146                         return
3147
3148                 idoc = xml.etree.ElementTree.fromstring(indexXml)
3149                 itemEls = idoc.findall('.//item')
3150                 for itemEl in itemEls:
3151                         mediaId = itemEl.findall('./guid')[0].text
3152                         shortMediaId = mediaId.split(':')[-1]
3153                         showId = mediaId.split(':')[-2].replace('.com', '')
3154                         officialTitle = itemEl.findall('./title')[0].text
3155                         officialDate = itemEl.findall('./pubDate')[0].text
3156
3157                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3158                                                 urllib.urlencode({'uri': mediaId}))
3159                         configReq = urllib2.Request(configUrl)
3160                         self.report_config_download(epTitle)
3161                         try:
3162                                 configXml = urllib2.urlopen(configReq).read()
3163                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3164                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3165                                 return
3166
3167                         cdoc = xml.etree.ElementTree.fromstring(configXml)
3168                         turls = []
3169                         for rendition in cdoc.findall('.//rendition'):
3170                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3171                                 turls.append(finfo)
3172
3173                         if len(turls) == 0:
3174                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3175                                 continue
3176
3177                         # For now, just pick the highest bitrate
3178                         format,video_url = turls[-1]
3179
3180                         self._downloader.increment_downloads()
3181
3182                         effTitle = showId + u'-' + epTitle
3183                         info = {
3184                                 'id': shortMediaId,
3185                                 'url': video_url,
3186                                 'uploader': showId,
3187                                 'upload_date': officialDate,
3188                                 'title': effTitle,
3189                                 'stitle': _simplify_title(effTitle),
3190                                 'ext': 'mp4',
3191                                 'format': format,
3192                                 'thumbnail': None,
3193                                 'description': officialTitle,
3194                                 'player_url': playerUrl
3195                         }
3196
3197                         try:
3198                                 self._downloader.process_info(info)
3199                         except UnavailableVideoError, err:
3200                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3201                                 continue
3202
3203
3204 class EscapistIE(InfoExtractor):
3205         """Information extractor for The Escapist """
3206
3207         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3208         IE_NAME = u'escapist'
3209
3210         def report_extraction(self, showName):
3211                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3212
3213         def report_config_download(self, showName):
3214                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3215
3216         def _real_extract(self, url):
3217                 htmlParser = HTMLParser.HTMLParser()
3218
3219                 mobj = re.match(self._VALID_URL, url)
3220                 if mobj is None:
3221                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3222                         return
3223                 showName = mobj.group('showname')
3224                 videoId = mobj.group('episode')
3225
3226                 self.report_extraction(showName)
3227                 try:
3228                         webPage = urllib2.urlopen(url).read()
3229                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3230                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3231                         return
3232
3233                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3234                 description = htmlParser.unescape(descMatch.group(1))
3235                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3236                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3237                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3238                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3239                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3240                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3241
3242                 self.report_config_download(showName)
3243                 try:
3244                         configJSON = urllib2.urlopen(configUrl).read()
3245                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3246                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3247                         return
3248
3249                 # Technically, it's JavaScript, not JSON
3250                 configJSON = configJSON.replace("'", '"')
3251
3252                 try:
3253                         config = json.loads(configJSON)
3254                 except (ValueError,), err:
3255                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3256                         return
3257
3258                 playlist = config['playlist']
3259                 videoUrl = playlist[1]['url']
3260
3261                 self._downloader.increment_downloads()
3262                 info = {
3263                         'id': videoId,
3264                         'url': videoUrl,
3265                         'uploader': showName,
3266                         'upload_date': None,
3267                         'title': showName,
3268                         'stitle': _simplify_title(showName),
3269                         'ext': 'flv',
3270                         'format': 'flv',
3271                         'thumbnail': imgUrl,
3272                         'description': description,
3273                         'player_url': playerUrl,
3274                 }
3275
3276                 try:
3277                         self._downloader.process_info(info)
3278                 except UnavailableVideoError, err:
3279                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3280
3281
3282 class CollegeHumorIE(InfoExtractor):
3283         """Information extractor for collegehumor.com"""
3284
3285         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3286         IE_NAME = u'collegehumor'
3287
3288         def report_webpage(self, video_id):
3289                 """Report information extraction."""
3290                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3291
3292         def report_extraction(self, video_id):
3293                 """Report information extraction."""
3294                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3295
3296         def _real_extract(self, url):
3297                 htmlParser = HTMLParser.HTMLParser()
3298
3299                 mobj = re.match(self._VALID_URL, url)
3300                 if mobj is None:
3301                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3302                         return
3303                 video_id = mobj.group('videoid')
3304
3305                 self.report_webpage(video_id)
3306                 request = urllib2.Request(url)
3307                 try:
3308                         webpage = urllib2.urlopen(request).read()
3309                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3310                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3311                         return
3312
3313                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3314                 if m is None:
3315                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3316                         return
3317                 internal_video_id = m.group('internalvideoid')
3318
3319                 info = {
3320                         'id': video_id,
3321                         'internal_id': internal_video_id,
3322                 }
3323
3324                 self.report_extraction(video_id)
3325                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3326                 try:
3327                         metaXml = urllib2.urlopen(xmlUrl).read()
3328                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3329                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3330                         return
3331
3332                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3333                 try:
3334                         videoNode = mdoc.findall('./video')[0]
3335                         info['description'] = videoNode.findall('./description')[0].text
3336                         info['title'] = videoNode.findall('./caption')[0].text
3337                         info['stitle'] = _simplify_title(info['title'])
3338                         info['url'] = videoNode.findall('./file')[0].text
3339                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3340                         info['ext'] = info['url'].rpartition('.')[2]
3341                         info['format'] = info['ext']
3342                 except IndexError:
3343                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3344                         return
3345
3346                 self._downloader.increment_downloads()
3347
3348                 try:
3349                         self._downloader.process_info(info)
3350                 except UnavailableVideoError, err:
3351                         self._downloader.trouble(u'\nERROR: unable to download video')
3352
3353
3354 class XVideosIE(InfoExtractor):
3355         """Information extractor for xvideos.com"""
3356
3357         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3358         IE_NAME = u'xvideos'
3359
3360         def report_webpage(self, video_id):
3361                 """Report information extraction."""
3362                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3363
3364         def report_extraction(self, video_id):
3365                 """Report information extraction."""
3366                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3367
3368         def _real_extract(self, url):
3369                 htmlParser = HTMLParser.HTMLParser()
3370
3371                 mobj = re.match(self._VALID_URL, url)
3372                 if mobj is None:
3373                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3374                         return
3375                 video_id = mobj.group(1).decode('utf-8')
3376
3377                 self.report_webpage(video_id)
3378
3379                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3380                 try:
3381                         webpage = urllib2.urlopen(request).read()
3382                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3383                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3384                         return
3385
3386                 self.report_extraction(video_id)
3387
3388
3389                 # Extract video URL
3390                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3391                 if mobj is None:
3392                         self._downloader.trouble(u'ERROR: unable to extract video url')
3393                         return
3394                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3395
3396
3397                 # Extract title
3398                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3399                 if mobj is None:
3400                         self._downloader.trouble(u'ERROR: unable to extract video title')
3401                         return
3402                 video_title = mobj.group(1).decode('utf-8')
3403
3404
3405                 # Extract video thumbnail
3406                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3407                 if mobj is None:
3408                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3409                         return
3410                 video_thumbnail = mobj.group(1).decode('utf-8')
3411
3412
3413
3414                 self._downloader.increment_downloads()
3415                 info = {
3416                         'id': video_id,
3417                         'url': video_url,
3418                         'uploader': None,
3419                         'upload_date': None,
3420                         'title': video_title,
3421                         'stitle': _simplify_title(video_title),
3422                         'ext': 'flv',
3423                         'format': 'flv',
3424                         'thumbnail': video_thumbnail,
3425                         'description': None,
3426                         'player_url': None,
3427                 }
3428
3429                 try:
3430                         self._downloader.process_info(info)
3431                 except UnavailableVideoError, err:
3432                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3433
3434
3435 class SoundcloudIE(InfoExtractor):
3436         """Information extractor for soundcloud.com
3437            To access the media, the uid of the song and a stream token
3438            must be extracted from the page source and the script must make
3439            a request to media.soundcloud.com/crossdomain.xml. Then
3440            the media can be grabbed by requesting from an url composed
3441            of the stream token and uid
3442          """
3443
3444         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3445         IE_NAME = u'soundcloud'
3446
3447         def __init__(self, downloader=None):
3448                 InfoExtractor.__init__(self, downloader)
3449
3450         def report_webpage(self, video_id):
3451                 """Report information extraction."""
3452                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3453
3454         def report_extraction(self, video_id):
3455                 """Report information extraction."""
3456                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3457
3458         def _real_extract(self, url):
3459                 htmlParser = HTMLParser.HTMLParser()
3460
3461                 mobj = re.match(self._VALID_URL, url)
3462                 if mobj is None:
3463                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3464                         return
3465
3466                 # extract uploader (which is in the url)
3467                 uploader = mobj.group(1).decode('utf-8')
3468                 # extract simple title (uploader + slug of song title)
3469                 slug_title =  mobj.group(2).decode('utf-8')
3470                 simple_title = uploader + '-' + slug_title
3471
3472                 self.report_webpage('%s/%s' % (uploader, slug_title))
3473
3474                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3475                 try:
3476                         webpage = urllib2.urlopen(request).read()
3477                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3478                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3479                         return
3480
3481                 self.report_extraction('%s/%s' % (uploader, slug_title))
3482
3483                 # extract uid and stream token that soundcloud hands out for access
3484                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3485                 if mobj:
3486                         video_id = mobj.group(1)
3487                         stream_token = mobj.group(2)
3488
3489                 # extract unsimplified title
3490                 mobj = re.search('"title":"(.*?)",', webpage)
3491                 if mobj:
3492                         title = mobj.group(1)
3493
3494                 # construct media url (with uid/token)
3495                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3496                 mediaURL = mediaURL % (video_id, stream_token)
3497
3498                 # description
3499                 description = u'No description available'
3500                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3501                 if mobj:
3502                         description = mobj.group(1)
3503
3504                 # upload date
3505                 upload_date = None
3506                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3507                 if mobj:
3508                         try:
3509                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3510                         except Exception, e:
3511                                 print str(e)
3512
3513                 # for soundcloud, a request to a cross domain is required for cookies
3514                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3515
3516                 try:
3517                         self._downloader.process_info({
3518                                 'id':           video_id.decode('utf-8'),
3519                                 'url':          mediaURL,
3520                                 'uploader':     uploader.decode('utf-8'),
3521                                 'upload_date':  upload_date,
3522                                 'title':        simple_title.decode('utf-8'),
3523                                 'stitle':       simple_title.decode('utf-8'),
3524                                 'ext':          u'mp3',
3525                                 'format':       u'NA',
3526                                 'player_url':   None,
3527                                 'description': description.decode('utf-8')
3528                         })
3529                 except UnavailableVideoError:
3530                         self._downloader.trouble(u'\nERROR: unable to download video')
3531
3532
3533 class InfoQIE(InfoExtractor):
3534         """Information extractor for infoq.com"""
3535
3536         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3537         IE_NAME = u'infoq'
3538
3539         def report_webpage(self, video_id):
3540                 """Report information extraction."""
3541                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3542
3543         def report_extraction(self, video_id):
3544                 """Report information extraction."""
3545                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3546
3547         def _real_extract(self, url):
3548                 htmlParser = HTMLParser.HTMLParser()
3549
3550                 mobj = re.match(self._VALID_URL, url)
3551                 if mobj is None:
3552                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3553                         return
3554
3555                 self.report_webpage(url)
3556
3557                 request = urllib2.Request(url)
3558                 try:
3559                         webpage = urllib2.urlopen(request).read()
3560                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3561                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3562                         return
3563
3564                 self.report_extraction(url)
3565
3566
3567                 # Extract video URL
3568                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3569                 if mobj is None:
3570                         self._downloader.trouble(u'ERROR: unable to extract video url')
3571                         return
3572                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3573
3574
3575                 # Extract title
3576                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3577                 if mobj is None:
3578                         self._downloader.trouble(u'ERROR: unable to extract video title')
3579                         return
3580                 video_title = mobj.group(1).decode('utf-8')
3581
3582                 # Extract description
3583                 video_description = u'No description available.'
3584                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3585                 if mobj is not None:
3586                         video_description = mobj.group(1).decode('utf-8')
3587
3588                 video_filename = video_url.split('/')[-1]
3589                 video_id, extension = video_filename.split('.')
3590
3591                 self._downloader.increment_downloads()
3592                 info = {
3593                         'id': video_id,
3594                         'url': video_url,
3595                         'uploader': None,
3596                         'upload_date': None,
3597                         'title': video_title,
3598                         'stitle': _simplify_title(video_title),
3599                         'ext': extension,
3600                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3601                         'thumbnail': None,
3602                         'description': video_description,
3603                         'player_url': None,
3604                 }
3605
3606                 try:
3607                         self._downloader.process_info(info)
3608                 except UnavailableVideoError, err:
3609                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3610
3611
3612
class PostProcessor(object):
        """Base class for post processors.

        A downloader keeps a chain of PostProcessor objects, each one added
        through the downloader's add_post_processor() method. Once a download
        has finished successfully the downloader walks that chain and calls
        run() on every member: the first call gets an initial argument and
        each following call gets the value returned by the one before it.

        Walking the chain ends as soon as a run() call returns None, or when
        the last member has been called.

        Like InfoExtractor objects, PostProcessor objects take part in a
        "mutual registration" process with their downloader.
        """

        # Downloader this object is attached to (None while unattached).
        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def run(self, information):
                """Process one downloaded file.

                "information" is a dictionary like the ones InfoExtractors build,
                plus one extra field, "filepath", pointing to the downloaded file.

                Return None to stop the postprocessing chain, or an information
                dictionary (possibly the received one with some fields changed)
                to pass on to the next object in the chain.

                A PostProcessingError may also be raised; the calling downloader
                takes it into account.
                """
                # The base implementation passes its input through untouched.
                return information

        def set_downloader(self, downloader):
                """Attach this PP to the given downloader."""
                self._downloader = downloader
3658
3659
class FFmpegExtractAudioPP(PostProcessor):
        """Post processor that writes an audio-only copy of the downloaded file.

        The audio codec of the download is detected with the external ffprobe
        program and the audio file is produced with the external ffmpeg
        program. Unless keepvideo is set, the downloaded file is removed
        afterwards.
        """

        def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
                """Store the conversion preferences.

                preferredcodec:   'best' (also used when None is given) or a codec
                                  name; run() knows 'mp3', 'aac' and 'vorbis'.
                preferredquality: value passed to ffmpeg's -ab option when the
                                  audio is re-encoded, or None to omit the option.
                keepvideo:        when true, the downloaded file is not deleted.
                """
                PostProcessor.__init__(self, downloader)
                if preferredcodec is None:
                        preferredcodec = 'best'
                self._preferredcodec = preferredcodec
                self._preferredquality = preferredquality
                self._keepvideo = keepvideo

        @staticmethod
        def get_audio_codec(path):
                """Return the audio codec name ffprobe reports for path, or None.

                None is returned when ffprobe cannot be run, exits with a non-zero
                status, or prints no "codec_type=audio" line preceded by a
                "codec_name=" line.
                """
                try:
                        # ffprobe's stderr is discarded; close the sink once the
                        # process is done instead of leaking the descriptor.
                        devnull = open(os.path.devnull, 'w')
                        try:
                                cmd = ['ffprobe', '-show_streams', '--', path]
                                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                                output = handle.communicate()[0]
                        finally:
                                devnull.close()
                        if handle.wait() != 0:
                                return None
                except (IOError, OSError):
                        return None
                # Remember the latest codec_name seen and return it at the first
                # line that marks the stream as audio.
                audio_codec = None
                for line in output.split('\n'):
                        if line.startswith('codec_name='):
                                audio_codec = line.split('=')[1].strip()
                        elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                                return audio_codec
                return None

        @staticmethod
        def run_ffmpeg(path, out_path, codec, more_opts):
                """Run ffmpeg to write the audio of path to out_path.

                codec is passed to -acodec and more_opts is a list of extra
                arguments placed before the output path. Returns True when ffmpeg
                exits with status 0, False otherwise (including when it cannot be
                started).
                """
                try:
                        # All ffmpeg output is discarded; the sink is closed afterwards.
                        devnull = open(os.path.devnull, 'w')
                        try:
                                cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
                                ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
                        finally:
                                devnull.close()
                        return (ret == 0)
                except (IOError, OSError):
                        return False

        def run(self, information):
                """Extract the audio of information['filepath'].

                Returns the information dictionary with 'filepath' pointing to the
                audio file, or None when the codec cannot be detected, ffmpeg fails
                or the downloaded file cannot be removed.
                """
                path = information['filepath']

                filecodec = self.get_audio_codec(path)
                if filecodec is None:
                        self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
                        return None

                more_opts = []
                if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
                        if filecodec in ['aac', 'mp3', 'vorbis']:
                                # Lossless if possible
                                acodec = 'copy'
                                extension = filecodec
                                if filecodec == 'aac':
                                        more_opts = ['-f', 'adts']
                                if filecodec == 'vorbis':
                                        extension = 'ogg'
                        else:
                                # MP3 otherwise.
                                acodec = 'libmp3lame'
                                extension = 'mp3'
                                if self._preferredquality is not None:
                                        more_opts += ['-ab', self._preferredquality]
                else:
                        # We convert the audio (lossy)
                        acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
                        extension = self._preferredcodec
                        if self._preferredquality is not None:
                                more_opts += ['-ab', self._preferredquality]
                        if self._preferredcodec == 'aac':
                                more_opts += ['-f', 'adts']
                        if self._preferredcodec == 'vorbis':
                                extension = 'ogg'

                prefix = os.path.splitext(path)[0]
                new_path = prefix + '.' + extension

                # If the destination is the downloaded file itself (e.g. foo.mp3 ->
                # foo.mp3), ffmpeg would be told to overwrite its own input and the
                # cleanup below would then delete the only remaining copy. Leave
                # the file alone and let the chain continue.
                if new_path == path:
                        self._downloader.to_screen(u'[ffmpeg] Destination is the downloaded file itself, skipping audio extraction: %s' % path)
                        return information

                self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
                status = self.run_ffmpeg(path, new_path, acodec, more_opts)

                if not status:
                        self._downloader.to_stderr(u'WARNING: error running ffmpeg')
                        return None

                # Try to update the date time for extracted audio file.
                if information.get('filetime') is not None:
                        try:
                                os.utime(new_path, (time.time(), information['filetime']))
                        except Exception:
                                # Best effort only; do not swallow KeyboardInterrupt/SystemExit.
                                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

                if not self._keepvideo:
                        try:
                                os.remove(path)
                        except (IOError, OSError):
                                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                                return None

                information['filepath'] = new_path
                return information
3759
3760
def updateSelf(downloader, filename):
        ''' Update the program file with the latest version from the repository

        Downloads UPDATE_URL and overwrites filename with it, unless the
        downloaded text carries the same __version__ as the running program.
        Exits the process through sys.exit() with an error message when
        filename is not writable, the download fails or the file cannot be
        written.
        '''
        # Note: downloader only used for options
        if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

        downloader.to_screen('Updating to latest version...')

        try:
                # Open before entering try/finally: if urlopen() itself fails there
                # is no handle to close, and closing an unbound name would replace
                # the IOError with an error the handler below does not catch.
                urlh = urllib.urlopen(UPDATE_URL)
                try:
                        newcontent = urlh.read()
                finally:
                        urlh.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to download latest version')

        # Nothing to do when the repository copy has the running version.
        vmatch = re.search("__version__ = '([^']+)'", newcontent)
        if vmatch is not None and vmatch.group(1) == __version__:
                downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
                return

        try:
                outf = open(filename, 'wb')
                try:
                        outf.write(newcontent)
                finally:
                        outf.close()
        except (IOError, OSError):
                sys.exit('ERROR: unable to overwrite current version')

        downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3793
def parseOpts():
        """Build the command-line parser and parse the process arguments.

        Returns a (parser, opts, args) tuple: the optparse.OptionParser, the
        parsed option values and the remaining positional arguments.
        """
        # Deferred imports
        import optparse

        def _format_option_string(option):
                ''' ('-o', '--option') -> -o, --option METAVAR'''

                opts = []

                if option._short_opts: opts.append(option._short_opts[0])
                if option._long_opts: opts.append(option._long_opts[0])
                if len(opts) > 1: opts.insert(1, ', ')

                if option.takes_value(): opts.append(' %s' % option.metavar)

                return "".join(opts)

        def _find_term_columns():
                ''' Return the terminal width in columns, or None if it is unknown '''
                columns = os.environ.get('COLUMNS', None)
                if columns:
                        try:
                                return int(columns)
                        except ValueError:
                                # COLUMNS holds something that is not a number; do not
                                # crash on it, ask the terminal instead.
                                pass

                try:
                        sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                        out,err = sp.communicate()
                        # The second field of the "stty size" output is used as the width.
                        return int(out.split()[1])
                except Exception:
                        # Best effort: stty may be missing or print nothing usable.
                        pass
                return None

        max_width = 80
        max_help_position = 80

        # No need to wrap help messages if we're on a wide console
        columns = _find_term_columns()
        if columns: max_width = columns

        fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
        fmt.format_option_strings = _format_option_string

        kw = {
                'version'   : __version__,
                'formatter' : fmt,
                'usage' : '%prog [options] url [url...]',
                'conflict_handler' : 'resolve',
        }

        parser = optparse.OptionParser(**kw)

        # option groups
        general        = optparse.OptionGroup(parser, 'General Options')
        selection      = optparse.OptionGroup(parser, 'Video Selection')
        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        video_format   = optparse.OptionGroup(parser, 'Video Format Options')
        postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
        filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
        verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

        general.add_option('-h', '--help',
                        action='help', help='print this help text and exit')
        general.add_option('-v', '--version',
                        action='version', help='print program version and exit')
        general.add_option('-U', '--update',
                        action='store_true', dest='update_self', help='update this program to latest version')
        general.add_option('-i', '--ignore-errors',
                        action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        general.add_option('-r', '--rate-limit',
                        dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
        general.add_option('-R', '--retries',
                        dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
        general.add_option('--dump-user-agent',
                        action='store_true', dest='dump_user_agent',
                        help='display the current browser identification', default=False)
        general.add_option('--list-extractors',
                        action='store_true', dest='list_extractors',
                        help='List all supported extractors and the URLs they would handle', default=False)

        selection.add_option('--playlist-start',
                        dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
        selection.add_option('--playlist-end',
                        dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
        selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
        selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

        authentication.add_option('-u', '--username',
                        dest='username', metavar='USERNAME', help='account username')
        authentication.add_option('-p', '--password',
                        dest='password', metavar='PASSWORD', help='account password')
        authentication.add_option('-n', '--netrc',
                        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


        # -f and --all-formats share the 'format' destination.
        video_format.add_option('-f', '--format',
                        action='store', dest='format', metavar='FORMAT', help='video format code')
        video_format.add_option('--all-formats',
                        action='store_const', dest='format', help='download all available video formats', const='all')
        video_format.add_option('--max-quality',
                        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
        video_format.add_option('-F', '--list-formats',
                        action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


        verbosity.add_option('-q', '--quiet',
                        action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                        action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
        verbosity.add_option('--skip-download',
                        action='store_true', dest='skip_download', help='do not download the video', default=False)
        verbosity.add_option('-g', '--get-url',
                        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        verbosity.add_option('--get-thumbnail',
                        action='store_true', dest='getthumbnail',
                        help='simulate, quiet but print thumbnail URL', default=False)
        verbosity.add_option('--get-description',
                        action='store_true', dest='getdescription',
                        help='simulate, quiet but print video description', default=False)
        verbosity.add_option('--get-filename',
                        action='store_true', dest='getfilename',
                        help='simulate, quiet but print output filename', default=False)
        verbosity.add_option('--get-format',
                        action='store_true', dest='getformat',
                        help='simulate, quiet but print output format', default=False)
        verbosity.add_option('--no-progress',
                        action='store_true', dest='noprogress', help='do not print progress bar', default=False)
        verbosity.add_option('--console-title',
                        action='store_true', dest='consoletitle',
                        help='display progress in console titlebar', default=False)


        filesystem.add_option('-t', '--title',
                        action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-A', '--auto-number',
                        action='store_true', dest='autonumber',
                        help='number downloaded files starting from 00000', default=False)
        filesystem.add_option('-o', '--output',
                        dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
        filesystem.add_option('-a', '--batch-file',
                        dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
        filesystem.add_option('-w', '--no-overwrites',
                        action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        # -c and --no-continue share the 'continue_dl' destination.
        filesystem.add_option('-c', '--continue',
                        action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        filesystem.add_option('--no-continue',
                        action='store_false', dest='continue_dl',
                        help='do not resume partially downloaded files (restart from beginning)')
        filesystem.add_option('--cookies',
                        dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
        filesystem.add_option('--no-part',
                        action='store_true', dest='nopart', help='do not use .part files', default=False)
        filesystem.add_option('--no-mtime',
                        action='store_false', dest='updatetime',
                        help='do not use the Last-modified header to set the file modification time', default=True)
        filesystem.add_option('--write-description',
                        action='store_true', dest='writedescription',
                        help='write video description to a .description file', default=False)
        filesystem.add_option('--write-info-json',
                        action='store_true', dest='writeinfojson',
                        help='write video metadata to a .info.json file', default=False)


        postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
                        help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
        postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
                        help='"best", "aac", "vorbis" or "mp3"; best by default')
        postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
                        help='ffmpeg audio bitrate specification, 128k by default')
        postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
                        help='keeps the video file on disk after the post-processing; the video is erased by default')


        # The order of these calls is the order of the groups in the help text.
        parser.add_option_group(general)
        parser.add_option_group(selection)
        parser.add_option_group(filesystem)
        parser.add_option_group(verbosity)
        parser.add_option_group(video_format)
        parser.add_option_group(authentication)
        parser.add_option_group(postproc)

        opts, args = parser.parse_args()

        return parser, opts, args
3980
def gen_extractors():
        """ Create one instance of each supported extractor and return them as a list.
        List order is significant: a URL is handled by the first extractor that matches it.
        """
        # These three are also handed to the extractors built on top of them.
        youtube_ie = YoutubeIE()
        google_ie = GoogleIE()
        yahoo_ie = YahooIE()

        extractors = [
                YoutubePlaylistIE(youtube_ie),
                YoutubeUserIE(youtube_ie),
                YoutubeSearchIE(youtube_ie),
                youtube_ie,
                MetacafeIE(youtube_ie),
                DailymotionIE(),
        ]
        extractors.extend([
                google_ie,
                GoogleSearchIE(google_ie),
                PhotobucketIE(),
                yahoo_ie,
                YahooSearchIE(yahoo_ie),
        ])
        extractors.extend([
                DepositFilesIE(),
                FacebookIE(),
                BlipTVIE(),
                VimeoIE(),
                MyVideoIE(),
                ComedyCentralIE(),
                EscapistIE(),
                CollegeHumorIE(),
                XVideosIE(),
                SoundcloudIE(),
                InfoQIE(),
        ])

        # Appended after every site-specific extractor.
        extractors.append(GenericIE())
        return extractors
4014
def _real_main():
        """Run the program: parse options, configure networking and download.

        The steps are: load the cookie jar, collect URLs from the batch file
        and the command line, install the urllib2 opener, validate the
        options, build the FileDownloader with its extractors and post
        processors, optionally self-update, download, and save the cookie jar.
        The function does not return normally: every path ends in sys.exit()
        or parser.error().
        """
        # Deferred import, only needed for the interactive password prompt below.
        import getpass

        parser, opts, args = parseOpts()

        # Open appropriate CookieJar
        if opts.cookiefile is None:
                jar = cookielib.CookieJar()
        else:
                try:
                        jar = cookielib.MozillaCookieJar(opts.cookiefile)
                        if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
                                jar.load()
                except (IOError, OSError):
                        sys.exit(u'ERROR: unable to open cookie file')

        # Dump user agent
        if opts.dump_user_agent:
                print(std_headers['User-Agent'])
                sys.exit(0)

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
                try:
                        if opts.batchfile == '-':
                                batchfd = sys.stdin
                        else:
                                batchfd = open(opts.batchfile, 'r')
                        try:
                                batchurls = batchfd.readlines()
                        finally:
                                # Close the file we opened ourselves, but never stdin.
                                if batchfd is not sys.stdin:
                                        batchfd.close()
                        batchurls = [x.strip() for x in batchurls]
                        # Drop empty lines and lines starting with '#', '/' or ';'.
                        batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
                except IOError:
                        sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # General configuration
        cookie_processor = urllib2.HTTPCookieProcessor(jar)
        opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
        urllib2.install_opener(opener)
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        extractors = gen_extractors()

        if opts.list_extractors:
                # Print each extractor followed by the given URLs it would handle;
                # a URL is listed only under the first extractor that accepts it.
                for ie in extractors:
                        print(ie.IE_NAME)
                        matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
                        all_urls = filter(lambda url: url not in matchedUrls, all_urls)
                        for mu in matchedUrls:
                                print(u'  ' + mu)
                sys.exit(0)

        # Conflicting, missing and erroneous options
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
                parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
                parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
                parser.error(u'using output template conflicts with using title, literal title or auto number')
        if opts.usetitle and opts.useliteral:
                parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
                opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
                numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                if numeric_limit is None:
                        parser.error(u'invalid rate limit specified')
                opts.ratelimit = numeric_limit
        if opts.retries is not None:
                try:
                        opts.retries = long(opts.retries)
                except (TypeError, ValueError):
                        parser.error(u'invalid retry count specified')
        try:
                opts.playliststart = int(opts.playliststart)
                if opts.playliststart <= 0:
                        raise ValueError(u'Playlist start must be positive')
        except (TypeError, ValueError):
                parser.error(u'invalid playlist start number specified')
        try:
                opts.playlistend = int(opts.playlistend)
                # -1 is the "up to the last video" default and is always accepted.
                if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
                        raise ValueError(u'Playlist end must be greater than playlist start')
        except (TypeError, ValueError):
                parser.error(u'invalid playlist end number specified')
        if opts.extractaudio:
                if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
                        parser.error(u'invalid audio format specified')

        # --all-formats stores 'all' in opts.format, while the templates below
        # used to test for '-1' only, so they were never picked for that option.
        # Both values now select the templates that embed %(format)s in the name.
        all_formats = opts.format in ('-1', 'all')

        # Any of the --get-* options implies quiet operation without download.
        print_only = (opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat)

        # File downloader
        fd = FileDownloader({
                'usenetrc': opts.usenetrc,
                'username': opts.username,
                'password': opts.password,
                'quiet': (opts.quiet or print_only),
                'forceurl': opts.geturl,
                'forcetitle': opts.gettitle,
                'forcethumbnail': opts.getthumbnail,
                'forcedescription': opts.getdescription,
                'forcefilename': opts.getfilename,
                'forceformat': opts.getformat,
                'simulate': opts.simulate,
                'skip_download': (opts.skip_download or opts.simulate or print_only),
                'format': opts.format,
                'format_limit': opts.format_limit,
                'listformats': opts.listformats,
                # The first alternative that yields a non-empty value wins.
                'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                        or (all_formats and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                        or (all_formats and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                        or (all_formats and u'%(id)s-%(format)s.%(ext)s')
                        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
                        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
                        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
                        or u'%(id)s.%(ext)s'),
                'ignoreerrors': opts.ignoreerrors,
                'ratelimit': opts.ratelimit,
                'nooverwrites': opts.nooverwrites,
                'retries': opts.retries,
                'continuedl': opts.continue_dl,
                'noprogress': opts.noprogress,
                'playliststart': opts.playliststart,
                'playlistend': opts.playlistend,
                'logtostderr': opts.outtmpl == '-',
                'consoletitle': opts.consoletitle,
                'nopart': opts.nopart,
                'updatetime': opts.updatetime,
                'writedescription': opts.writedescription,
                'writeinfojson': opts.writeinfojson,
                'matchtitle': opts.matchtitle,
                'rejecttitle': opts.rejecttitle,
                })
        for extractor in extractors:
                fd.add_info_extractor(extractor)

        # PostProcessors
        if opts.extractaudio:
                fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

        # Update version
        if opts.update_self:
                updateSelf(fd, sys.argv[0])

        # Maybe do nothing
        if len(all_urls) < 1:
                if not opts.update_self:
                        parser.error(u'you must provide at least one URL')
                else:
                        sys.exit()
        retcode = fd.download(all_urls)

        # Dump cookie jar if requested
        if opts.cookiefile is not None:
                try:
                        jar.save()
                except (IOError, OSError):
                        sys.exit(u'ERROR: unable to save cookie jar')

        sys.exit(retcode)
4174
def main():
        """Run _real_main() and translate known failures into a process exit.

        A DownloadError exits with status 1; a SameFileError or a
        KeyboardInterrupt exits with an error message. Any other exception,
        including the SystemExit raised by _real_main() itself, propagates.
        """
        try:
                _real_main()
        except DownloadError:
                status = 1
        except SameFileError:
                status = u'ERROR: fixed output name but more than one file to download'
        except KeyboardInterrupt:
                status = u'\nERROR: Interrupted by user'
        else:
                # Nothing was caught: return without forcing an exit.
                return
        sys.exit(status)

# Start the program only when this file is executed as a script.
if __name__ == '__main__':
        main()
4187
4188 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: