# ]> jfr.im git - yt-dlp.git/blob - youtube-dl
# 080490ded626ac36b3a9a4f76a83b02f294308d6
# [yt-dlp.git] / youtube-dl
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# Author: Vasyl' Vavrychuk
# Author: Witold Baryluk
# Author: Paweł Paprota
# Author: Gergely Imreh
# License: Public domain code
11 import cookielib
12 import ctypes
13 import datetime
14 import email.utils
15 import gzip
16 import htmlentitydefs
17 import httplib
18 import locale
19 import math
20 import netrc
21 import os
22 import os.path
23 import re
24 import socket
25 import string
26 import StringIO
27 import subprocess
28 import sys
29 import time
30 import urllib
31 import urllib2
32 import zlib
33
34 # parse_qs was moved from the cgi module to the urlparse module recently.
35 try:
36 from urlparse import parse_qs
37 except ImportError:
38 from cgi import parse_qs
39
40 std_headers = {
41 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
42 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
43 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 'Accept-Encoding': 'gzip, deflate',
45 'Accept-Language': 'en-us,en;q=0.5',
46 }
47
48 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
49
50 def preferredencoding():
51 """Get preferred encoding.
52
53 Returns the best encoding scheme for the system, based on
54 locale.getpreferredencoding() and some further tweaks.
55 """
56 def yield_preferredencoding():
57 try:
58 pref = locale.getpreferredencoding()
59 u'TEST'.encode(pref)
60 except:
61 pref = 'UTF-8'
62 while True:
63 yield pref
64 return yield_preferredencoding().next()
65
66 def htmlentity_transform(matchobj):
67 """Transforms an HTML entity to a Unicode character.
68
69 This function receives a match object and is intended to be used with
70 the re.sub() function.
71 """
72 entity = matchobj.group(1)
73
74 # Known non-numeric HTML entity
75 if entity in htmlentitydefs.name2codepoint:
76 return unichr(htmlentitydefs.name2codepoint[entity])
77
78 # Unicode character
79 mobj = re.match(ur'(?u)#(x?\d+)', entity)
80 if mobj is not None:
81 numstr = mobj.group(1)
82 if numstr.startswith(u'x'):
83 base = 16
84 numstr = u'0%s' % numstr
85 else:
86 base = 10
87 return unichr(long(numstr, base))
88
89 # Unknown entity in name, return its literal representation
90 return (u'&%s;' % entity)
91
92 def sanitize_title(utitle):
93 """Sanitizes a video title so it could be used as part of a filename."""
94 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
95 return utitle.replace(unicode(os.sep), u'%')
96
97 def sanitize_open(filename, open_mode):
98 """Try to open the given filename, and slightly tweak it if this fails.
99
100 Attempts to open the given filename. If this fails, it tries to change
101 the filename slightly, step by step, until it's either able to open it
102 or it fails and raises a final exception, like the standard open()
103 function.
104
105 It returns the tuple (stream, definitive_file_name).
106 """
107 try:
108 if filename == u'-':
109 if sys.platform == 'win32':
110 import msvcrt
111 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
112 return (sys.stdout, filename)
113 stream = open(filename, open_mode)
114 return (stream, filename)
115 except (IOError, OSError), err:
116 # In case of error, try to remove win32 forbidden chars
117 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
118
119 # An exception here should be caught in the caller
120 stream = open(filename, open_mode)
121 return (stream, filename)
122
123 def timeconvert(timestr):
124 """Convert RFC 2822 defined time string into system timestamp"""
125 timestamp = None
126 timetuple = email.utils.parsedate_tz(timestr)
127 if timetuple is not None:
128 timestamp = email.utils.mktime_tz(timetuple)
129 return timestamp
130
131 class DownloadError(Exception):
132 """Download Error exception.
133
134 This exception may be thrown by FileDownloader objects if they are not
135 configured to continue on errors. They will contain the appropriate
136 error message.
137 """
138 pass
139
140 class SameFileError(Exception):
141 """Same File exception.
142
143 This exception will be thrown by FileDownloader objects if they detect
144 multiple files would have to be downloaded to the same file on disk.
145 """
146 pass
147
148 class PostProcessingError(Exception):
149 """Post Processing exception.
150
151 This exception may be raised by PostProcessor's .run() method to
152 indicate an error in the postprocessing task.
153 """
154 pass
155
156 class UnavailableVideoError(Exception):
157 """Unavailable Format exception.
158
159 This exception will be thrown when a video is requested
160 in a format that is not available for that video.
161 """
162 pass
163
164 class ContentTooShortError(Exception):
165 """Content Too Short exception.
166
167 This exception may be raised by FileDownloader objects when a file they
168 download is too small for what the server announced first, indicating
169 the connection was probably interrupted.
170 """
171 # Both in bytes
172 downloaded = None
173 expected = None
174
175 def __init__(self, downloaded, expected):
176 self.downloaded = downloaded
177 self.expected = expected
178
179 class YoutubeDLHandler(urllib2.HTTPHandler):
180 """Handler for HTTP requests and responses.
181
182 This class, when installed with an OpenerDirector, automatically adds
183 the standard headers to every HTTP request and handles gzipped and
184 deflated responses from web servers. If compression is to be avoided in
185 a particular request, the original request in the program code only has
186 to include the HTTP header "Youtubedl-No-Compression", which will be
187 removed before making the real request.
188
189 Part of this code was copied from:
190
191 http://techknack.net/python-urllib2-handlers/
192
193 Andrew Rowls, the author of that code, agreed to release it to the
194 public domain.
195 """
196
197 @staticmethod
198 def deflate(data):
199 try:
200 return zlib.decompress(data, -zlib.MAX_WBITS)
201 except zlib.error:
202 return zlib.decompress(data)
203
204 @staticmethod
205 def addinfourl_wrapper(stream, headers, url, code):
206 if hasattr(urllib2.addinfourl, 'getcode'):
207 return urllib2.addinfourl(stream, headers, url, code)
208 ret = urllib2.addinfourl(stream, headers, url)
209 ret.code = code
210 return ret
211
212 def http_request(self, req):
213 for h in std_headers:
214 if h in req.headers:
215 del req.headers[h]
216 req.add_header(h, std_headers[h])
217 if 'Youtubedl-no-compression' in req.headers:
218 if 'Accept-encoding' in req.headers:
219 del req.headers['Accept-encoding']
220 del req.headers['Youtubedl-no-compression']
221 return req
222
223 def http_response(self, req, resp):
224 old_resp = resp
225 # gzip
226 if resp.headers.get('Content-encoding', '') == 'gzip':
227 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
228 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
229 resp.msg = old_resp.msg
230 # deflate
231 if resp.headers.get('Content-encoding', '') == 'deflate':
232 gz = StringIO.StringIO(self.deflate(resp.read()))
233 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
234 resp.msg = old_resp.msg
235 return resp
236
237 class FileDownloader(object):
238 """File Downloader class.
239
240 File downloader objects are the ones responsible of downloading the
241 actual video file and writing it to disk if the user has requested
242 it, among some other tasks. In most cases there should be one per
243 program. As, given a video URL, the downloader doesn't know how to
244 extract all the needed information, task that InfoExtractors do, it
245 has to pass the URL to one of them.
246
247 For this, file downloader objects have a method that allows
248 InfoExtractors to be registered in a given order. When it is passed
249 a URL, the file downloader handles it to the first InfoExtractor it
250 finds that reports being able to handle it. The InfoExtractor extracts
251 all the information about the video or videos the URL refers to, and
252 asks the FileDownloader to process the video information, possibly
253 downloading the video.
254
255 File downloaders accept a lot of parameters. In order not to saturate
256 the object constructor with arguments, it receives a dictionary of
257 options instead. These options are available through the params
258 attribute for the InfoExtractors to use. The FileDownloader also
259 registers itself as the downloader in charge for the InfoExtractors
260 that are added to it, so this is a "mutual registration".
261
262 Available options:
263
264 username: Username for authentication purposes.
265 password: Password for authentication purposes.
266 usenetrc: Use netrc for authentication instead.
267 quiet: Do not print messages to stdout.
268 forceurl: Force printing final URL.
269 forcetitle: Force printing title.
270 forcethumbnail: Force printing thumbnail URL.
271 forcedescription: Force printing description.
272 forcefilename: Force printing final filename.
273 simulate: Do not download the video files.
274 format: Video format code.
275 format_limit: Highest quality format to try.
276 outtmpl: Template for output names.
277 ignoreerrors: Do not stop on download errors.
278 ratelimit: Download speed limit, in bytes/sec.
279 nooverwrites: Prevent overwriting files.
280 retries: Number of times to retry for HTTP error 5xx
281 continuedl: Try to continue downloads if possible.
282 noprogress: Do not print the progress bar.
283 playliststart: Playlist item to start at.
284 playlistend: Playlist item to end at.
285 logtostderr: Log messages to stderr instead of stdout.
286 consoletitle: Display progress in console window's titlebar.
287 nopart: Do not use temporary .part files.
288 updatetime: Use the Last-modified header to set output file timestamps.
289 """
290
291 params = None
292 _ies = []
293 _pps = []
294 _download_retcode = None
295 _num_downloads = None
296 _screen_file = None
297
298 def __init__(self, params):
299 """Create a FileDownloader object with the given options."""
300 self._ies = []
301 self._pps = []
302 self._download_retcode = 0
303 self._num_downloads = 0
304 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
305 self.params = params
306
307 @staticmethod
308 def pmkdir(filename):
309 """Create directory components in filename. Similar to Unix "mkdir -p"."""
310 components = filename.split(os.sep)
311 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
312 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
313 for dir in aggregate:
314 if not os.path.exists(dir):
315 os.mkdir(dir)
316
317 @staticmethod
318 def format_bytes(bytes):
319 if bytes is None:
320 return 'N/A'
321 if type(bytes) is str:
322 bytes = float(bytes)
323 if bytes == 0.0:
324 exponent = 0
325 else:
326 exponent = long(math.log(bytes, 1024.0))
327 suffix = 'bkMGTPEZY'[exponent]
328 converted = float(bytes) / float(1024**exponent)
329 return '%.2f%s' % (converted, suffix)
330
331 @staticmethod
332 def calc_percent(byte_counter, data_len):
333 if data_len is None:
334 return '---.-%'
335 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
336
337 @staticmethod
338 def calc_eta(start, now, total, current):
339 if total is None:
340 return '--:--'
341 dif = now - start
342 if current == 0 or dif < 0.001: # One millisecond
343 return '--:--'
344 rate = float(current) / dif
345 eta = long((float(total) - float(current)) / rate)
346 (eta_mins, eta_secs) = divmod(eta, 60)
347 if eta_mins > 99:
348 return '--:--'
349 return '%02d:%02d' % (eta_mins, eta_secs)
350
351 @staticmethod
352 def calc_speed(start, now, bytes):
353 dif = now - start
354 if bytes == 0 or dif < 0.001: # One millisecond
355 return '%10s' % '---b/s'
356 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
357
358 @staticmethod
359 def best_block_size(elapsed_time, bytes):
360 new_min = max(bytes / 2.0, 1.0)
361 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
362 if elapsed_time < 0.001:
363 return long(new_max)
364 rate = bytes / elapsed_time
365 if rate > new_max:
366 return long(new_max)
367 if rate < new_min:
368 return long(new_min)
369 return long(rate)
370
371 @staticmethod
372 def parse_bytes(bytestr):
373 """Parse a string indicating a byte quantity into a long integer."""
374 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
375 if matchobj is None:
376 return None
377 number = float(matchobj.group(1))
378 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
379 return long(round(number * multiplier))
380
381 def add_info_extractor(self, ie):
382 """Add an InfoExtractor object to the end of the list."""
383 self._ies.append(ie)
384 ie.set_downloader(self)
385
386 def add_post_processor(self, pp):
387 """Add a PostProcessor object to the end of the chain."""
388 self._pps.append(pp)
389 pp.set_downloader(self)
390
391 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
392 """Print message to stdout if not in quiet mode."""
393 try:
394 if not self.params.get('quiet', False):
395 terminator = [u'\n', u''][skip_eol]
396 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
397 self._screen_file.flush()
398 except (UnicodeEncodeError), err:
399 if not ignore_encoding_errors:
400 raise
401
402 def to_stderr(self, message):
403 """Print message to stderr."""
404 print >>sys.stderr, message.encode(preferredencoding())
405
406 def to_cons_title(self, message):
407 """Set console/terminal window title to message."""
408 if not self.params.get('consoletitle', False):
409 return
410 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
411 # c_wchar_p() might not be necessary if `message` is
412 # already of type unicode()
413 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
414 elif 'TERM' in os.environ:
415 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
416
417 def fixed_template(self):
418 """Checks if the output template is fixed."""
419 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
420
421 def trouble(self, message=None):
422 """Determine action to take when a download problem appears.
423
424 Depending on if the downloader has been configured to ignore
425 download errors or not, this method may throw an exception or
426 not when errors are found, after printing the message.
427 """
428 if message is not None:
429 self.to_stderr(message)
430 if not self.params.get('ignoreerrors', False):
431 raise DownloadError(message)
432 self._download_retcode = 1
433
434 def slow_down(self, start_time, byte_counter):
435 """Sleep if the download speed is over the rate limit."""
436 rate_limit = self.params.get('ratelimit', None)
437 if rate_limit is None or byte_counter == 0:
438 return
439 now = time.time()
440 elapsed = now - start_time
441 if elapsed <= 0.0:
442 return
443 speed = float(byte_counter) / elapsed
444 if speed > rate_limit:
445 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
446
447 def temp_name(self, filename):
448 """Returns a temporary filename for the given filename."""
449 if self.params.get('nopart', False) or filename == u'-' or \
450 (os.path.exists(filename) and not os.path.isfile(filename)):
451 return filename
452 return filename + u'.part'
453
454 def undo_temp_name(self, filename):
455 if filename.endswith(u'.part'):
456 return filename[:-len(u'.part')]
457 return filename
458
459 def try_rename(self, old_filename, new_filename):
460 try:
461 if old_filename == new_filename:
462 return
463 os.rename(old_filename, new_filename)
464 except (IOError, OSError), err:
465 self.trouble(u'ERROR: unable to rename file')
466
467 def try_utime(self, filename, last_modified_hdr):
468 """Try to set the last-modified time of the given file."""
469 if last_modified_hdr is None:
470 return
471 if not os.path.isfile(filename):
472 return
473 timestr = last_modified_hdr
474 if timestr is None:
475 return
476 filetime = timeconvert(timestr)
477 if filetime is None:
478 return
479 try:
480 os.utime(filename,(time.time(), filetime))
481 except:
482 pass
483
484 def report_destination(self, filename):
485 """Report destination filename."""
486 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
487
488 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
489 """Report download progress."""
490 if self.params.get('noprogress', False):
491 return
492 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
493 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
494 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
495 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
496
497 def report_resuming_byte(self, resume_len):
498 """Report attempt to resume at given byte."""
499 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
500
501 def report_retry(self, count, retries):
502 """Report retry in case of HTTP error 5xx"""
503 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
504
505 def report_file_already_downloaded(self, file_name):
506 """Report file has already been fully downloaded."""
507 try:
508 self.to_screen(u'[download] %s has already been downloaded' % file_name)
509 except (UnicodeEncodeError), err:
510 self.to_screen(u'[download] The file has already been downloaded')
511
512 def report_unable_to_resume(self):
513 """Report it was impossible to resume download."""
514 self.to_screen(u'[download] Unable to resume')
515
516 def report_finish(self):
517 """Report download finished."""
518 if self.params.get('noprogress', False):
519 self.to_screen(u'[download] Download completed')
520 else:
521 self.to_screen(u'')
522
523 def increment_downloads(self):
524 """Increment the ordinal that assigns a number to each file."""
525 self._num_downloads += 1
526
527 def prepare_filename(self, info_dict):
528 """Generate the output filename."""
529 try:
530 template_dict = dict(info_dict)
531 template_dict['epoch'] = unicode(long(time.time()))
532 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
533 filename = self.params['outtmpl'] % template_dict
534 return filename
535 except (ValueError, KeyError), err:
536 self.trouble(u'ERROR: invalid system charset or erroneous output template')
537 return None
538
539 def process_info(self, info_dict):
540 """Process a single dictionary returned by an InfoExtractor."""
541 filename = self.prepare_filename(info_dict)
542 # Do nothing else if in simulate mode
543 if self.params.get('simulate', False):
544 # Forced printings
545 if self.params.get('forcetitle', False):
546 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
547 if self.params.get('forceurl', False):
548 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
549 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
550 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
551 if self.params.get('forcedescription', False) and 'description' in info_dict:
552 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
553 if self.params.get('forcefilename', False) and filename is not None:
554 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
555
556 return
557
558 if filename is None:
559 return
560 if self.params.get('nooverwrites', False) and os.path.exists(filename):
561 self.to_stderr(u'WARNING: file exists and will be skipped')
562 return
563
564 try:
565 self.pmkdir(filename)
566 except (OSError, IOError), err:
567 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
568 return
569
570 try:
571 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
572 except (OSError, IOError), err:
573 raise UnavailableVideoError
574 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
575 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
576 return
577 except (ContentTooShortError, ), err:
578 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
579 return
580
581 if success:
582 try:
583 self.post_process(filename, info_dict)
584 except (PostProcessingError), err:
585 self.trouble(u'ERROR: postprocessing: %s' % str(err))
586 return
587
588 def download(self, url_list):
589 """Download a given list of URLs."""
590 if len(url_list) > 1 and self.fixed_template():
591 raise SameFileError(self.params['outtmpl'])
592
593 for url in url_list:
594 suitable_found = False
595 for ie in self._ies:
596 # Go to next InfoExtractor if not suitable
597 if not ie.suitable(url):
598 continue
599
600 # Suitable InfoExtractor found
601 suitable_found = True
602
603 # Extract information from URL and process it
604 ie.extract(url)
605
606 # Suitable InfoExtractor had been found; go to next URL
607 break
608
609 if not suitable_found:
610 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
611
612 return self._download_retcode
613
614 def post_process(self, filename, ie_info):
615 """Run the postprocessing chain on the given file."""
616 info = dict(ie_info)
617 info['filepath'] = filename
618 for pp in self._pps:
619 info = pp.run(info)
620 if info is None:
621 break
622
623 def _download_with_rtmpdump(self, filename, url, player_url):
624 self.report_destination(filename)
625 tmpfilename = self.temp_name(filename)
626
627 # Check for rtmpdump first
628 try:
629 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
630 except (OSError, IOError):
631 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
632 return False
633
634 # Download using rtmpdump. rtmpdump returns exit code 2 when
635 # the connection was interrumpted and resuming appears to be
636 # possible. This is part of rtmpdump's normal usage, AFAIK.
637 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
638 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
639 while retval == 2 or retval == 1:
640 prevsize = os.path.getsize(tmpfilename)
641 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
642 time.sleep(5.0) # This seems to be needed
643 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
644 cursize = os.path.getsize(tmpfilename)
645 if prevsize == cursize and retval == 1:
646 break
647 if retval == 0:
648 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
649 self.try_rename(tmpfilename, filename)
650 return True
651 else:
652 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
653 return False
654
655 def _do_download(self, filename, url, player_url):
656 # Check file already present
657 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
658 self.report_file_already_downloaded(filename)
659 return True
660
661 # Attempt to download using rtmpdump
662 if url.startswith('rtmp'):
663 return self._download_with_rtmpdump(filename, url, player_url)
664
665 tmpfilename = self.temp_name(filename)
666 stream = None
667 open_mode = 'wb'
668
669 # Do not include the Accept-Encoding header
670 headers = {'Youtubedl-no-compression': 'True'}
671 basic_request = urllib2.Request(url, None, headers)
672 request = urllib2.Request(url, None, headers)
673
674 # Establish possible resume length
675 if os.path.isfile(tmpfilename):
676 resume_len = os.path.getsize(tmpfilename)
677 else:
678 resume_len = 0
679
680 # Request parameters in case of being able to resume
681 if self.params.get('continuedl', False) and resume_len != 0:
682 self.report_resuming_byte(resume_len)
683 request.add_header('Range','bytes=%d-' % resume_len)
684 open_mode = 'ab'
685
686 count = 0
687 retries = self.params.get('retries', 0)
688 while count <= retries:
689 # Establish connection
690 try:
691 data = urllib2.urlopen(request)
692 break
693 except (urllib2.HTTPError, ), err:
694 if (err.code < 500 or err.code >= 600) and err.code != 416:
695 # Unexpected HTTP error
696 raise
697 elif err.code == 416:
698 # Unable to resume (requested range not satisfiable)
699 try:
700 # Open the connection again without the range header
701 data = urllib2.urlopen(basic_request)
702 content_length = data.info()['Content-Length']
703 except (urllib2.HTTPError, ), err:
704 if err.code < 500 or err.code >= 600:
705 raise
706 else:
707 # Examine the reported length
708 if (content_length is not None and
709 (resume_len - 100 < long(content_length) < resume_len + 100)):
710 # The file had already been fully downloaded.
711 # Explanation to the above condition: in issue #175 it was revealed that
712 # YouTube sometimes adds or removes a few bytes from the end of the file,
713 # changing the file size slightly and causing problems for some users. So
714 # I decided to implement a suggested change and consider the file
715 # completely downloaded if the file size differs less than 100 bytes from
716 # the one in the hard drive.
717 self.report_file_already_downloaded(filename)
718 self.try_rename(tmpfilename, filename)
719 return True
720 else:
721 # The length does not match, we start the download over
722 self.report_unable_to_resume()
723 open_mode = 'wb'
724 break
725 # Retry
726 count += 1
727 if count <= retries:
728 self.report_retry(count, retries)
729
730 if count > retries:
731 self.trouble(u'ERROR: giving up after %s retries' % retries)
732 return False
733
734 data_len = data.info().get('Content-length', None)
735 if data_len is not None:
736 data_len = long(data_len) + resume_len
737 data_len_str = self.format_bytes(data_len)
738 byte_counter = 0 + resume_len
739 block_size = 1024
740 start = time.time()
741 while True:
742 # Download and write
743 before = time.time()
744 data_block = data.read(block_size)
745 after = time.time()
746 if len(data_block) == 0:
747 break
748 byte_counter += len(data_block)
749
750 # Open file just in time
751 if stream is None:
752 try:
753 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
754 filename = self.undo_temp_name(tmpfilename)
755 self.report_destination(filename)
756 except (OSError, IOError), err:
757 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
758 return False
759 try:
760 stream.write(data_block)
761 except (IOError, OSError), err:
762 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
763 return False
764 block_size = self.best_block_size(after - before, len(data_block))
765
766 # Progress message
767 percent_str = self.calc_percent(byte_counter, data_len)
768 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
769 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
770 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
771
772 # Apply rate limit
773 self.slow_down(start, byte_counter - resume_len)
774
775 stream.close()
776 self.report_finish()
777 if data_len is not None and byte_counter != data_len:
778 raise ContentTooShortError(byte_counter, long(data_len))
779 self.try_rename(tmpfilename, filename)
780
781 # Update file modification time
782 if self.params.get('updatetime', True):
783 self.try_utime(filename, data.info().get('last-modified', None))
784
785 return True
786
787 class InfoExtractor(object):
788 """Information Extractor class.
789
790 Information extractors are the classes that, given a URL, extract
791 information from the video (or videos) the URL refers to. This
792 information includes the real video URL, the video title and simplified
793 title, author and others. The information is stored in a dictionary
794 which is then passed to the FileDownloader. The FileDownloader
795 processes this information possibly downloading the video to the file
796 system, among other possible outcomes. The dictionaries must include
797 the following fields:
798
799 id: Video identifier.
800 url: Final video URL.
801 uploader: Nickname of the video uploader.
802 title: Literal title.
803 stitle: Simplified title.
804 ext: Video filename extension.
805 format: Video format.
806 player_url: SWF Player URL (may be None).
807
808 The following fields are optional. Their primary purpose is to allow
809 youtube-dl to serve as the backend for a video search function, such
810 as the one in youtube2mp3. They are only used when their respective
811 forced printing functions are called:
812
813 thumbnail: Full URL to a video thumbnail image.
814 description: One-line video description.
815
816 Subclasses of this one should re-define the _real_initialize() and
817 _real_extract() methods, as well as the suitable() static method.
818 Probably, they should also be instantiated and added to the main
819 downloader.
820 """
821
822 _ready = False
823 _downloader = None
824
825 def __init__(self, downloader=None):
826 """Constructor. Receives an optional downloader."""
827 self._ready = False
828 self.set_downloader(downloader)
829
830 @staticmethod
831 def suitable(url):
832 """Receives a URL and returns True if suitable for this IE."""
833 return False
834
835 def initialize(self):
836 """Initializes an instance (authentication, etc)."""
837 if not self._ready:
838 self._real_initialize()
839 self._ready = True
840
841 def extract(self, url):
842 """Extracts URL information and returns it in list of dicts."""
843 self.initialize()
844 return self._real_extract(url)
845
846 def set_downloader(self, downloader):
847 """Sets the downloader for this IE."""
848 self._downloader = downloader
849
850 def _real_initialize(self):
851 """Real initialization process. Redefine in subclasses."""
852 pass
853
854 def _real_extract(self, url):
855 """Real extraction process. Redefine in subclasses."""
856 pass
857
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Matches bare video IDs as well as youtu.be/, /v/, /embed/, /e/ and
    # watch?v= URL forms; the video ID is captured in group 2.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality (best first)
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    # Maps a format code to its container extension; codes not listed here
    # fall back to 'flv' in _real_extract.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '45': 'webm',
    }

    @staticmethod
    def suitable(url):
        """Return True if this IE can handle the given URL."""
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available for the video."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        """Set the site language and, if credentials are available, log in
        and confirm age.  Failures are reported as warnings and abort the
        initialization early without raising."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    # Raised and caught below so a missing machine entry is
                    # reported the same way as a parse error.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the credentials were rejected
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the watch page and get_video_info data for the video in
        *url*, pick the format(s) to download, and hand the resulting
        information dict(s) to the downloader via process_info()."""
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        # NOTE(review): the literal '&amp;' before has_verified looks like an
        # HTML-escape slip, but it is preserved here byte-for-byte.
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Undo the JSON-style backslash escaping in the matched URL
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        # Try several 'el' query variants in turn until one of them returns a
        # response containing a 'token' parameter.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title: collapse every run of non-alphanumeric characters
        # to a single underscore and trim leading/trailing underscores
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces before trying the known formats.
            # After the first successful strptime the value is already in
            # YYYYMMDD form, so the remaining attempts fail and are ignored.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description (only fetched when the user asked to print it)
        video_description = 'No description available.'
        if self._downloader.params.get('forcedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1)

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'fmt_url_map' in video_info and len(video_info['fmt_url_map']) >= 1 and ',' in video_info['fmt_url_map'][0]:
            # fmt_url_map is a comma-separated list of 'format|url' pairs
            url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                # Only consider formats at or below the requested quality cap
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]

        else:
            self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
            return

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Find the video URL in fmt_url_map or conn paramters
            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': player_url,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
1137
1138
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # YoutubeIE instance used to delegate 'yt-' prefixed Metacafe IDs
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True if this IE can handle the given URL."""
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and POST the age
        confirmation so that filtered videos become accessible."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, uploader and title for a Metacafe video and
        hand the information dict to the downloader."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; 'yt-<id>' IDs are mirrors that
        # are delegated to the YouTube extractor.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Older page layout: media URL is a plain query parameter
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer page layout: media URL is inside the flashvars JSON blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1282
1283
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Group 1 is the video ID, group 2 the slug used as simplified title
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        """Return True if this IE can handle the given URL."""
        return (re.match(DailymotionIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No site-wide setup needed for Dailymotion
        return

    def _real_extract(self, url):
        """Extract the media URL, uploader and title for a Dailymotion video
        and hand the information dict to the downloader."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
        mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1371
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        """Return True if this IE can handle the given URL."""
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No site-wide setup needed for Google Video
        return

    def _real_extract(self, url):
        """Extract the media URL, title and optional thumbnail for a Google
        Video page and hand the information dict to the downloader."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct mp4 download link; fall back to the flv stream URL,
            # which is \x-escaped inside the page's JavaScript.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = urllib.unquote(mobj.group(1))
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (requires an extra search-page request, so
        # only done when the user asked for it)
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': u'NA',
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1481
1482
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        """Return True if this IE can handle the given URL."""
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No site-wide setup needed for Photobucket
        return

    def _real_extract(self, url):
        """Extract the media URL, uploader and title for a Photobucket video
        and hand the information dict to the downloader."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # The <title> element carries both the video title (group 1) and the
        # uploader name (group 2)
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader,
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1563
1564
1565 class YahooIE(InfoExtractor):
1566 """Information extractor for video.yahoo.com."""
1567
1568 # _VALID_URL matches all Yahoo! Video URLs
1569 # _VPAGE_URL matches only the extractable '/watch/' URLs
1570 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1571 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1572
1573 def __init__(self, downloader=None):
1574 InfoExtractor.__init__(self, downloader)
1575
1576 @staticmethod
1577 def suitable(url):
1578 return (re.match(YahooIE._VALID_URL, url) is not None)
1579
1580 def report_download_webpage(self, video_id):
1581 """Report webpage download."""
1582 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1583
1584 def report_extraction(self, video_id):
1585 """Report information extraction."""
1586 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1587
1588 def _real_initialize(self):
1589 return
1590
1591 def _real_extract(self, url, new_video=True):
1592 # Extract ID from URL
1593 mobj = re.match(self._VALID_URL, url)
1594 if mobj is None:
1595 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1596 return
1597
1598 # At this point we have a new video
1599 self._downloader.increment_downloads()
1600 video_id = mobj.group(2)
1601 video_extension = 'flv'
1602
1603 # Rewrite valid but non-extractable URLs as
1604 # extractable English language /watch/ URLs
1605 if re.match(self._VPAGE_URL, url) is None:
1606 request = urllib2.Request(url)
1607 try:
1608 webpage = urllib2.urlopen(request).read()
1609 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1610 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1611 return
1612
1613 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1614 if mobj is None:
1615 self._downloader.trouble(u'ERROR: Unable to extract id field')
1616 return
1617 yahoo_id = mobj.group(1)
1618
1619 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1620 if mobj is None:
1621 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1622 return
1623 yahoo_vid = mobj.group(1)
1624
1625 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1626 return self._real_extract(url, new_video=False)
1627
1628 # Retrieve video webpage to extract further information
1629 request = urllib2.Request(url)
1630 try:
1631 self.report_download_webpage(video_id)
1632 webpage = urllib2.urlopen(request).read()
1633 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1634 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1635 return
1636
1637 # Extract uploader and title from webpage
1638 self.report_extraction(video_id)
1639 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1640 if mobj is None:
1641 self._downloader.trouble(u'ERROR: unable to extract video title')
1642 return
1643 video_title = mobj.group(1).decode('utf-8')
1644 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1645
1646 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1647 if mobj is None:
1648 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1649 return
1650 video_uploader = mobj.group(1).decode('utf-8')
1651
1652 # Extract video thumbnail
1653 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1654 if mobj is None:
1655 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1656 return
1657 video_thumbnail = mobj.group(1).decode('utf-8')
1658
1659 # Extract video description
1660 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1661 if mobj is None:
1662 self._downloader.trouble(u'ERROR: unable to extract video description')
1663 return
1664 video_description = mobj.group(1).decode('utf-8')
1665 if not video_description: video_description = 'No description available.'
1666
1667 # Extract video height and width
1668 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1669 if mobj is None:
1670 self._downloader.trouble(u'ERROR: unable to extract video height')
1671 return
1672 yv_video_height = mobj.group(1)
1673
1674 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1675 if mobj is None:
1676 self._downloader.trouble(u'ERROR: unable to extract video width')
1677 return
1678 yv_video_width = mobj.group(1)
1679
1680 # Retrieve video playlist to extract media URL
1681 # I'm not completely sure what all these options are, but we
1682 # seem to need most of them, otherwise the server sends a 401.
1683 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1684 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1685 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1686 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1687 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1688 try:
1689 self.report_download_webpage(video_id)
1690 webpage = urllib2.urlopen(request).read()
1691 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1692 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1693 return
1694
1695 # Extract media URL from playlist XML
1696 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1697 if mobj is None:
1698 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1699 return
1700 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1701 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1702
1703 try:
1704 # Process video information
1705 self._downloader.process_info({
1706 'id': video_id.decode('utf-8'),
1707 'url': video_url,
1708 'uploader': video_uploader,
1709 'upload_date': u'NA',
1710 'title': video_title,
1711 'stitle': simple_title,
1712 'ext': video_extension.decode('utf-8'),
1713 'thumbnail': video_thumbnail.decode('utf-8'),
1714 'description': video_description,
1715 'thumbnail': video_thumbnail,
1716 'description': video_description,
1717 'player_url': None,
1718 })
1719 except UnavailableVideoError:
1720 self._downloader.trouble(u'\nERROR: unable to download video')
1721
1722
1723 class VimeoIE(InfoExtractor):
1724 """Information extractor for vimeo.com."""
1725
1726 # _VALID_URL matches Vimeo URLs
1727 _VALID_URL = r'(?:http://)?(?:(?:www|player).)?vimeo\.com/(?:video/)?([0-9]+)'
1728
1729 def __init__(self, downloader=None):
1730 InfoExtractor.__init__(self, downloader)
1731
1732 @staticmethod
1733 def suitable(url):
1734 return (re.match(VimeoIE._VALID_URL, url) is not None)
1735
1736 def report_download_webpage(self, video_id):
1737 """Report webpage download."""
1738 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1739
1740 def report_extraction(self, video_id):
1741 """Report information extraction."""
1742 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1743
1744 def _real_initialize(self):
1745 return
1746
1747 def _real_extract(self, url, new_video=True):
1748 # Extract ID from URL
1749 mobj = re.match(self._VALID_URL, url)
1750 if mobj is None:
1751 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1752 return
1753
1754 # At this point we have a new video
1755 self._downloader.increment_downloads()
1756 video_id = mobj.group(1)
1757 video_extension = 'flv' # FIXME
1758
1759 # Retrieve video webpage to extract further information
1760 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1761 try:
1762 self.report_download_webpage(video_id)
1763 webpage = urllib2.urlopen(request).read()
1764 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1765 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1766 return
1767
1768 # Now we begin extracting as much information as we can from what we
1769 # retrieved. First we extract the information common to all extractors,
1770 # and latter we extract those that are Vimeo specific.
1771 self.report_extraction(video_id)
1772
1773 # Extract title
1774 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1775 if mobj is None:
1776 self._downloader.trouble(u'ERROR: unable to extract video title')
1777 return
1778 video_title = mobj.group(1).decode('utf-8')
1779 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1780
1781 # Extract uploader
1782 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1783 if mobj is None:
1784 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1785 return
1786 video_uploader = mobj.group(1).decode('utf-8')
1787
1788 # Extract video thumbnail
1789 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1790 if mobj is None:
1791 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1792 return
1793 video_thumbnail = mobj.group(1).decode('utf-8')
1794
1795 # # Extract video description
1796 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1797 # if mobj is None:
1798 # self._downloader.trouble(u'ERROR: unable to extract video description')
1799 # return
1800 # video_description = mobj.group(1).decode('utf-8')
1801 # if not video_description: video_description = 'No description available.'
1802 video_description = 'Foo.'
1803
1804 # Vimeo specific: extract request signature
1805 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
1806 if mobj is None:
1807 self._downloader.trouble(u'ERROR: unable to extract request signature')
1808 return
1809 sig = mobj.group(1).decode('utf-8')
1810
1811 # Vimeo specific: Extract request signature expiration
1812 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
1813 if mobj is None:
1814 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
1815 return
1816 sig_exp = mobj.group(1).decode('utf-8')
1817
1818 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
1819
1820 try:
1821 # Process video information
1822 self._downloader.process_info({
1823 'id': video_id.decode('utf-8'),
1824 'url': video_url,
1825 'uploader': video_uploader,
1826 'upload_date': u'NA',
1827 'title': video_title,
1828 'stitle': simple_title,
1829 'ext': video_extension.decode('utf-8'),
1830 'thumbnail': video_thumbnail.decode('utf-8'),
1831 'description': video_description,
1832 'thumbnail': video_thumbnail,
1833 'description': video_description,
1834 'player_url': None,
1835 })
1836 except UnavailableVideoError:
1837 self._downloader.trouble(u'ERROR: unable to download video')
1838
1839
1840 class GenericIE(InfoExtractor):
1841 """Generic last-resort information extractor."""
1842
1843 def __init__(self, downloader=None):
1844 InfoExtractor.__init__(self, downloader)
1845
1846 @staticmethod
1847 def suitable(url):
1848 return True
1849
1850 def report_download_webpage(self, video_id):
1851 """Report webpage download."""
1852 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1853 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1854
1855 def report_extraction(self, video_id):
1856 """Report information extraction."""
1857 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1858
1859 def _real_initialize(self):
1860 return
1861
1862 def _real_extract(self, url):
1863 # At this point we have a new video
1864 self._downloader.increment_downloads()
1865
1866 video_id = url.split('/')[-1]
1867 request = urllib2.Request(url)
1868 try:
1869 self.report_download_webpage(video_id)
1870 webpage = urllib2.urlopen(request).read()
1871 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1872 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1873 return
1874 except ValueError, err:
1875 # since this is the last-resort InfoExtractor, if
1876 # this error is thrown, it'll be thrown here
1877 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1878 return
1879
1880 self.report_extraction(video_id)
1881 # Start with something easy: JW Player in SWFObject
1882 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1883 if mobj is None:
1884 # Broaden the search a little bit
1885 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1886 if mobj is None:
1887 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1888 return
1889
1890 # It's possible that one of the regexes
1891 # matched, but returned an empty group:
1892 if mobj.group(1) is None:
1893 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1894 return
1895
1896 video_url = urllib.unquote(mobj.group(1))
1897 video_id = os.path.basename(video_url)
1898
1899 # here's a fun little line of code for you:
1900 video_extension = os.path.splitext(video_id)[1][1:]
1901 video_id = os.path.splitext(video_id)[0]
1902
1903 # it's tempting to parse this further, but you would
1904 # have to take into account all the variations like
1905 # Video Title - Site Name
1906 # Site Name | Video Title
1907 # Video Title - Tagline | Site Name
1908 # and so on and so forth; it's just not practical
1909 mobj = re.search(r'<title>(.*)</title>', webpage)
1910 if mobj is None:
1911 self._downloader.trouble(u'ERROR: unable to extract title')
1912 return
1913 video_title = mobj.group(1).decode('utf-8')
1914 video_title = sanitize_title(video_title)
1915 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1916
1917 # video uploader is domain name
1918 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1919 if mobj is None:
1920 self._downloader.trouble(u'ERROR: unable to extract title')
1921 return
1922 video_uploader = mobj.group(1).decode('utf-8')
1923
1924 try:
1925 # Process video information
1926 self._downloader.process_info({
1927 'id': video_id.decode('utf-8'),
1928 'url': video_url.decode('utf-8'),
1929 'uploader': video_uploader,
1930 'upload_date': u'NA',
1931 'title': video_title,
1932 'stitle': simple_title,
1933 'ext': video_extension.decode('utf-8'),
1934 'format': u'NA',
1935 'player_url': None,
1936 })
1937 except UnavailableVideoError, err:
1938 self._downloader.trouble(u'\nERROR: unable to download video')
1939
1940
1941 class YoutubeSearchIE(InfoExtractor):
1942 """Information Extractor for YouTube search queries."""
1943 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1944 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1945 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1946 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1947 _youtube_ie = None
1948 _max_youtube_results = 1000
1949
1950 def __init__(self, youtube_ie, downloader=None):
1951 InfoExtractor.__init__(self, downloader)
1952 self._youtube_ie = youtube_ie
1953
1954 @staticmethod
1955 def suitable(url):
1956 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1957
1958 def report_download_page(self, query, pagenum):
1959 """Report attempt to download playlist page with given number."""
1960 query = query.decode(preferredencoding())
1961 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1962
1963 def _real_initialize(self):
1964 self._youtube_ie.initialize()
1965
1966 def _real_extract(self, query):
1967 mobj = re.match(self._VALID_QUERY, query)
1968 if mobj is None:
1969 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1970 return
1971
1972 prefix, query = query.split(':')
1973 prefix = prefix[8:]
1974 query = query.encode('utf-8')
1975 if prefix == '':
1976 self._download_n_results(query, 1)
1977 return
1978 elif prefix == 'all':
1979 self._download_n_results(query, self._max_youtube_results)
1980 return
1981 else:
1982 try:
1983 n = long(prefix)
1984 if n <= 0:
1985 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1986 return
1987 elif n > self._max_youtube_results:
1988 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1989 n = self._max_youtube_results
1990 self._download_n_results(query, n)
1991 return
1992 except ValueError: # parsing prefix as integer fails
1993 self._download_n_results(query, 1)
1994 return
1995
1996 def _download_n_results(self, query, n):
1997 """Downloads a specified number of results for a query"""
1998
1999 video_ids = []
2000 already_seen = set()
2001 pagenum = 1
2002
2003 while True:
2004 self.report_download_page(query, pagenum)
2005 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2006 request = urllib2.Request(result_url)
2007 try:
2008 page = urllib2.urlopen(request).read()
2009 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2010 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2011 return
2012
2013 # Extract video identifiers
2014 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2015 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2016 if video_id not in already_seen:
2017 video_ids.append(video_id)
2018 already_seen.add(video_id)
2019 if len(video_ids) == n:
2020 # Specified n videos reached
2021 for id in video_ids:
2022 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2023 return
2024
2025 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2026 for id in video_ids:
2027 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2028 return
2029
2030 pagenum = pagenum + 1
2031
2032 class GoogleSearchIE(InfoExtractor):
2033 """Information Extractor for Google Video search queries."""
2034 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2035 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2036 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2037 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2038 _google_ie = None
2039 _max_google_results = 1000
2040
2041 def __init__(self, google_ie, downloader=None):
2042 InfoExtractor.__init__(self, downloader)
2043 self._google_ie = google_ie
2044
2045 @staticmethod
2046 def suitable(url):
2047 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2048
2049 def report_download_page(self, query, pagenum):
2050 """Report attempt to download playlist page with given number."""
2051 query = query.decode(preferredencoding())
2052 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2053
2054 def _real_initialize(self):
2055 self._google_ie.initialize()
2056
2057 def _real_extract(self, query):
2058 mobj = re.match(self._VALID_QUERY, query)
2059 if mobj is None:
2060 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2061 return
2062
2063 prefix, query = query.split(':')
2064 prefix = prefix[8:]
2065 query = query.encode('utf-8')
2066 if prefix == '':
2067 self._download_n_results(query, 1)
2068 return
2069 elif prefix == 'all':
2070 self._download_n_results(query, self._max_google_results)
2071 return
2072 else:
2073 try:
2074 n = long(prefix)
2075 if n <= 0:
2076 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2077 return
2078 elif n > self._max_google_results:
2079 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2080 n = self._max_google_results
2081 self._download_n_results(query, n)
2082 return
2083 except ValueError: # parsing prefix as integer fails
2084 self._download_n_results(query, 1)
2085 return
2086
2087 def _download_n_results(self, query, n):
2088 """Downloads a specified number of results for a query"""
2089
2090 video_ids = []
2091 already_seen = set()
2092 pagenum = 1
2093
2094 while True:
2095 self.report_download_page(query, pagenum)
2096 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2097 request = urllib2.Request(result_url)
2098 try:
2099 page = urllib2.urlopen(request).read()
2100 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2101 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2102 return
2103
2104 # Extract video identifiers
2105 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2106 video_id = mobj.group(1)
2107 if video_id not in already_seen:
2108 video_ids.append(video_id)
2109 already_seen.add(video_id)
2110 if len(video_ids) == n:
2111 # Specified n videos reached
2112 for id in video_ids:
2113 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2114 return
2115
2116 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2117 for id in video_ids:
2118 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2119 return
2120
2121 pagenum = pagenum + 1
2122
class YahooSearchIE(InfoExtractor):
	"""Information Extractor for Yahoo! Video search queries."""
	# Query scheme: "yvsearch<N>:<terms>", "yvsearchall:<terms>" or
	# plain "yvsearch:<terms>".
	_VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
	# %s slots: URL-quoted query, result page/offset number.
	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
	# Group 1 captures the "<id>/<id>" part of a watch URL on a results page.
	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
	# Presence of this pattern on a page means more result pages exist.
	_MORE_PAGES_INDICATOR = r'\s*Next'
	# Delegate extractor used to download each individual result.
	_yahoo_ie = None
	_max_yahoo_results = 1000

	def __init__(self, yahoo_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._yahoo_ie = yahoo_ie

	@staticmethod
	def suitable(url):
		# True when the string is a yvsearch query this extractor handles.
		return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._yahoo_ie.initialize()

	def _real_extract(self, query):
		# Parse the "yvsearch..." prefix to decide how many results to fetch,
		# then delegate the actual work to _download_n_results().
		mobj = re.match(self._VALID_QUERY, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		prefix, query = query.split(':')
		# Strip the fixed 8-character "yvsearch" part; what remains is
		# '', 'all' or a number.
		prefix = prefix[8:]
		query = query.encode('utf-8')
		if prefix == '':
			# Bare "yvsearch:" downloads only the first result.
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_yahoo_results)
			return
		else:
			try:
				n = long(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_yahoo_results:
					# Clamp oversized requests to the service maximum.
					self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
					n = self._max_yahoo_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		video_ids = []
		already_seen = set()
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
				if len(video_ids) == n:
					# Specified n videos reached
					for id in video_ids:
						self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
					return

			# No "Next" marker: the result set is exhausted.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
				return

			pagenum = pagenum + 1
2213
2214 class YoutubePlaylistIE(InfoExtractor):
2215 """Information Extractor for YouTube playlists."""
2216
2217 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2218 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2219 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2220 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2221 _youtube_ie = None
2222
2223 def __init__(self, youtube_ie, downloader=None):
2224 InfoExtractor.__init__(self, downloader)
2225 self._youtube_ie = youtube_ie
2226
2227 @staticmethod
2228 def suitable(url):
2229 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2230
2231 def report_download_page(self, playlist_id, pagenum):
2232 """Report attempt to download playlist page with given number."""
2233 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2234
2235 def _real_initialize(self):
2236 self._youtube_ie.initialize()
2237
2238 def _real_extract(self, url):
2239 # Extract playlist id
2240 mobj = re.match(self._VALID_URL, url)
2241 if mobj is None:
2242 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2243 return
2244
2245 # Single video case
2246 if mobj.group(3) is not None:
2247 self._youtube_ie.extract(mobj.group(3))
2248 return
2249
2250 # Download playlist pages
2251 # prefix is 'p' as default for playlists but there are other types that need extra care
2252 playlist_prefix = mobj.group(1)
2253 if playlist_prefix == 'a':
2254 playlist_access = 'artist'
2255 else:
2256 playlist_prefix = 'p'
2257 playlist_access = 'view_play_list'
2258 playlist_id = mobj.group(2)
2259 video_ids = []
2260 pagenum = 1
2261
2262 while True:
2263 self.report_download_page(playlist_id, pagenum)
2264 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2265 try:
2266 page = urllib2.urlopen(request).read()
2267 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2268 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2269 return
2270
2271 # Extract video identifiers
2272 ids_in_page = []
2273 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2274 if mobj.group(1) not in ids_in_page:
2275 ids_in_page.append(mobj.group(1))
2276 video_ids.extend(ids_in_page)
2277
2278 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2279 break
2280 pagenum = pagenum + 1
2281
2282 playliststart = self._downloader.params.get('playliststart', 1) - 1
2283 playlistend = self._downloader.params.get('playlistend', -1)
2284 video_ids = video_ids[playliststart:playlistend]
2285
2286 for id in video_ids:
2287 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2288 return
2289
2290 class YoutubeUserIE(InfoExtractor):
2291 """Information Extractor for YouTube users."""
2292
2293 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2294 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2295 _GDATA_PAGE_SIZE = 50
2296 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2297 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2298 _youtube_ie = None
2299
2300 def __init__(self, youtube_ie, downloader=None):
2301 InfoExtractor.__init__(self, downloader)
2302 self._youtube_ie = youtube_ie
2303
2304 @staticmethod
2305 def suitable(url):
2306 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2307
2308 def report_download_page(self, username, start_index):
2309 """Report attempt to download user page."""
2310 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2311 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2312
2313 def _real_initialize(self):
2314 self._youtube_ie.initialize()
2315
2316 def _real_extract(self, url):
2317 # Extract username
2318 mobj = re.match(self._VALID_URL, url)
2319 if mobj is None:
2320 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2321 return
2322
2323 username = mobj.group(1)
2324
2325 # Download video ids using YouTube Data API. Result size per
2326 # query is limited (currently to 50 videos) so we need to query
2327 # page by page until there are no video ids - it means we got
2328 # all of them.
2329
2330 video_ids = []
2331 pagenum = 0
2332
2333 while True:
2334 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2335 self.report_download_page(username, start_index)
2336
2337 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2338
2339 try:
2340 page = urllib2.urlopen(request).read()
2341 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2342 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2343 return
2344
2345 # Extract video identifiers
2346 ids_in_page = []
2347
2348 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2349 if mobj.group(1) not in ids_in_page:
2350 ids_in_page.append(mobj.group(1))
2351
2352 video_ids.extend(ids_in_page)
2353
2354 # A little optimization - if current page is not
2355 # "full", ie. does not contain PAGE_SIZE video ids then
2356 # we can assume that this page is the last one - there
2357 # are no more ids on further pages - no need to query
2358 # again.
2359
2360 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2361 break
2362
2363 pagenum += 1
2364
2365 all_ids_count = len(video_ids)
2366 playliststart = self._downloader.params.get('playliststart', 1) - 1
2367 playlistend = self._downloader.params.get('playlistend', -1)
2368
2369 if playlistend == -1:
2370 video_ids = video_ids[playliststart:]
2371 else:
2372 video_ids = video_ids[playliststart:playlistend]
2373
2374 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2375 (username, all_ids_count, len(video_ids)))
2376
2377 for video_id in video_ids:
2378 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2379
2380
2381 class DepositFilesIE(InfoExtractor):
2382 """Information extractor for depositfiles.com"""
2383
2384 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2385
2386 def __init__(self, downloader=None):
2387 InfoExtractor.__init__(self, downloader)
2388
2389 @staticmethod
2390 def suitable(url):
2391 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2392
2393 def report_download_webpage(self, file_id):
2394 """Report webpage download."""
2395 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2396
2397 def report_extraction(self, file_id):
2398 """Report information extraction."""
2399 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2400
2401 def _real_initialize(self):
2402 return
2403
2404 def _real_extract(self, url):
2405 # At this point we have a new file
2406 self._downloader.increment_downloads()
2407
2408 file_id = url.split('/')[-1]
2409 # Rebuild url in english locale
2410 url = 'http://depositfiles.com/en/files/' + file_id
2411
2412 # Retrieve file webpage with 'Free download' button pressed
2413 free_download_indication = { 'gateway_result' : '1' }
2414 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2415 try:
2416 self.report_download_webpage(file_id)
2417 webpage = urllib2.urlopen(request).read()
2418 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2419 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2420 return
2421
2422 # Search for the real file URL
2423 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2424 if (mobj is None) or (mobj.group(1) is None):
2425 # Try to figure out reason of the error.
2426 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2427 if (mobj is not None) and (mobj.group(1) is not None):
2428 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2429 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2430 else:
2431 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2432 return
2433
2434 file_url = mobj.group(1)
2435 file_extension = os.path.splitext(file_url)[1][1:]
2436
2437 # Search for file title
2438 mobj = re.search(r'<b title="(.*?)">', webpage)
2439 if mobj is None:
2440 self._downloader.trouble(u'ERROR: unable to extract title')
2441 return
2442 file_title = mobj.group(1).decode('utf-8')
2443
2444 try:
2445 # Process file information
2446 self._downloader.process_info({
2447 'id': file_id.decode('utf-8'),
2448 'url': file_url.decode('utf-8'),
2449 'uploader': u'NA',
2450 'upload_date': u'NA',
2451 'title': file_title,
2452 'stitle': file_title,
2453 'ext': file_extension.decode('utf-8'),
2454 'format': u'NA',
2455 'player_url': None,
2456 })
2457 except UnavailableVideoError, err:
2458 self._downloader.trouble(u'ERROR: unable to download file')
2459
2460 class FacebookIE(InfoExtractor):
2461 """Information Extractor for Facebook"""
2462
2463 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2464 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2465 _NETRC_MACHINE = 'facebook'
2466 _available_formats = ['highqual', 'lowqual']
2467 _video_extensions = {
2468 'highqual': 'mp4',
2469 'lowqual': 'mp4',
2470 }
2471
	def __init__(self, downloader=None):
		# Plain passthrough to the InfoExtractor base constructor.
		InfoExtractor.__init__(self, downloader)
2474
2475 @staticmethod
2476 def suitable(url):
2477 return (re.match(FacebookIE._VALID_URL, url) is not None)
2478
2479 def _reporter(self, message):
2480 """Add header and report message."""
2481 self._downloader.to_screen(u'[facebook] %s' % message)
2482
2483 def report_login(self):
2484 """Report attempt to log in."""
2485 self._reporter(u'Logging in')
2486
2487 def report_video_webpage_download(self, video_id):
2488 """Report attempt to download video webpage."""
2489 self._reporter(u'%s: Downloading video webpage' % video_id)
2490
2491 def report_information_extraction(self, video_id):
2492 """Report attempt to extract video information."""
2493 self._reporter(u'%s: Extracting video information' % video_id)
2494
2495 def _parse_page(self, video_webpage):
2496 """Extract video information from page"""
2497 # General data
2498 data = {'title': r'class="video_title datawrap">(.*?)</',
2499 'description': r'<div class="datawrap">(.*?)</div>',
2500 'owner': r'\("video_owner_name", "(.*?)"\)',
2501 'upload_date': r'data-date="(.*?)"',
2502 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2503 }
2504 video_info = {}
2505 for piece in data.keys():
2506 mobj = re.search(data[piece], video_webpage)
2507 if mobj is not None:
2508 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2509
2510 # Video urls
2511 video_urls = {}
2512 for fmt in self._available_formats:
2513 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2514 if mobj is not None:
2515 # URL is in a Javascript segment inside an escaped Unicode format within
2516 # the generally utf-8 page
2517 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2518 video_info['video_urls'] = video_urls
2519
2520 return video_info
2521
	def _real_initialize(self):
		# Optionally log in to Facebook before extraction. Credentials come
		# from --username/--password or from .netrc; with neither, the login
		# step is silently skipped and extraction proceeds anonymously.
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				# A broken .netrc only warns; downloading can still work.
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		if useremail is None:
			# No credentials available: proceed without logging in.
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# Facebook serves the login form again when authentication failed.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return
2565
2566 def _real_extract(self, url):
2567 mobj = re.match(self._VALID_URL, url)
2568 if mobj is None:
2569 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2570 return
2571 video_id = mobj.group('ID')
2572
2573 # Get video webpage
2574 self.report_video_webpage_download(video_id)
2575 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2576 try:
2577 page = urllib2.urlopen(request)
2578 video_webpage = page.read()
2579 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2580 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2581 return
2582
2583 # Start extracting information
2584 self.report_information_extraction(video_id)
2585
2586 # Extract information
2587 video_info = self._parse_page(video_webpage)
2588
2589 # uploader
2590 if 'owner' not in video_info:
2591 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2592 return
2593 video_uploader = video_info['owner']
2594
2595 # title
2596 if 'title' not in video_info:
2597 self._downloader.trouble(u'ERROR: unable to extract video title')
2598 return
2599 video_title = video_info['title']
2600 video_title = video_title.decode('utf-8')
2601 video_title = sanitize_title(video_title)
2602
2603 # simplified title
2604 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2605 simple_title = simple_title.strip(ur'_')
2606
2607 # thumbnail image
2608 if 'thumbnail' not in video_info:
2609 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2610 video_thumbnail = ''
2611 else:
2612 video_thumbnail = video_info['thumbnail']
2613
2614 # upload date
2615 upload_date = u'NA'
2616 if 'upload_date' in video_info:
2617 upload_time = video_info['upload_date']
2618 timetuple = email.utils.parsedate_tz(upload_time)
2619 if timetuple is not None:
2620 try:
2621 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2622 except:
2623 pass
2624
2625 # description
2626 video_description = 'No description available.'
2627 if (self._downloader.params.get('forcedescription', False) and
2628 'description' in video_info):
2629 video_description = video_info['description']
2630
2631 url_map = video_info['video_urls']
2632 if len(url_map.keys()) > 0:
2633 # Decide which formats to download
2634 req_format = self._downloader.params.get('format', None)
2635 format_limit = self._downloader.params.get('format_limit', None)
2636
2637 if format_limit is not None and format_limit in self._available_formats:
2638 format_list = self._available_formats[self._available_formats.index(format_limit):]
2639 else:
2640 format_list = self._available_formats
2641 existing_formats = [x for x in format_list if x in url_map]
2642 if len(existing_formats) == 0:
2643 self._downloader.trouble(u'ERROR: no known formats available for video')
2644 return
2645 if req_format is None:
2646 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2647 elif req_format == '-1':
2648 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2649 else:
2650 # Specific format
2651 if req_format not in url_map:
2652 self._downloader.trouble(u'ERROR: requested format not available')
2653 return
2654 video_url_list = [(req_format, url_map[req_format])] # Specific format
2655
2656 for format_param, video_real_url in video_url_list:
2657
2658 # At this point we have a new video
2659 self._downloader.increment_downloads()
2660
2661 # Extension
2662 video_extension = self._video_extensions.get(format_param, 'mp4')
2663
2664 # Find the video URL in fmt_url_map or conn paramters
2665 try:
2666 # Process video information
2667 self._downloader.process_info({
2668 'id': video_id.decode('utf-8'),
2669 'url': video_real_url.decode('utf-8'),
2670 'uploader': video_uploader.decode('utf-8'),
2671 'upload_date': upload_date,
2672 'title': video_title,
2673 'stitle': simple_title,
2674 'ext': video_extension.decode('utf-8'),
2675 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2676 'thumbnail': video_thumbnail.decode('utf-8'),
2677 'description': video_description.decode('utf-8'),
2678 'player_url': None,
2679 })
2680 except UnavailableVideoError, err:
2681 self._downloader.trouble(u'\nERROR: unable to download video')
2682
class PostProcessor(object):
    """Base class for postprocessing steps.

    Instances are registered on a downloader via its add_post_processor()
    method. After every successful download the downloader walks its chain
    of PostProcessors, feeding each one's return value to the next. A
    return value of None terminates the chain; raising PostProcessingError
    reports a failure back to the downloader.

    Registration is mutual, mirroring how InfoExtractor objects attach to
    a downloader.
    """

    # The FileDownloader this postprocessor reports progress/errors to.
    _downloader = None

    def __init__(self, downloader=None):
        # A downloader may be supplied now or attached later through
        # set_downloader().
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach the downloader this postprocessor belongs to."""
        self._downloader = downloader

    def run(self, information):
        """Process one finished download.

        "information" is an InfoExtractor-style dictionary augmented with
        a "filepath" key naming the downloaded file. Return None to stop
        the postprocessing chain, or an (optionally modified) information
        dictionary to pass along to the next postprocessor.
        """
        # The base class is an identity transform; subclasses override this.
        return information
2728
class FFmpegExtractAudioPP(PostProcessor):
    """Postprocessor that converts a downloaded video into an audio-only
    file, using the external ffprobe and ffmpeg programs.

    preferredcodec is 'best', 'aac' or 'mp3'; None means 'best'. When the
    source audio already matches the request (or 'best' is asked for and
    the stream is aac/mp3), the audio is copied losslessly; otherwise it
    is re-encoded.
    """

    def __init__(self, downloader=None, preferredcodec=None):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec

    @staticmethod
    def get_audio_codec(path):
        """Return the audio codec name of the file at path, or None if it
        cannot be determined (ffprobe missing, failing, or no audio)."""
        try:
            cmd = ['ffprobe', '-show_streams', '--', path]
            # open() instead of the deprecated file() builtin, and close
            # the devnull handle explicitly so the descriptor is not leaked.
            devnull = open(os.path.devnull, 'w')
            try:
                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                output = handle.communicate()[0]
                if handle.wait() != 0:
                    return None
            finally:
                devnull.close()
        except (IOError, OSError):
            return None
        # ffprobe prints one key=value block per stream; remember the last
        # codec_name seen and report it once a codec_type=audio line confirms
        # that stream is an audio stream.
        audio_codec = None
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to extract the audio from path into out_path using
        the given codec and extra options. Return True on success."""
        try:
            cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
            # Same devnull handling as get_audio_codec(): open() + explicit
            # close instead of a leaked file() handle.
            devnull = open(os.path.devnull, 'w')
            try:
                ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
            finally:
                devnull.close()
            return (ret == 0)
        except (IOError, OSError):
            return False

    def run(self, information):
        """Convert the downloaded file to the preferred audio format.

        On success returns the information dict with 'filepath' updated to
        the new audio file (the original video is deleted). Returns None
        (stopping the PP chain) on any failure.
        """
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
            if filecodec == 'aac' or filecodec == 'mp3':
                # Stream is already in a target codec: copy without
                # re-encoding (lossless).
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    # Raw AAC needs the ADTS container to be playable standalone.
                    more_opts = ['-f', 'adts']
            else:
                # Anything else gets transcoded to MP3.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = ['-ab', '128k']
        else:
            # A specific codec was requested that differs from the source:
            # convert (lossy).
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = ['-ab', '128k']
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)

        if not status:
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        # Delete the original video only after a successful conversion.
        try:
            os.remove(path)
        except (IOError, OSError):
            self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
            return None

        information['filepath'] = new_path
        return information
2810
### MAIN PROGRAM ###
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        # Function to update the program file with the latest version from the repository.
        def update_self(downloader, filename):
            """Overwrite this script (filename) with the latest released version.

            The downloader argument is only used for its to_screen() output.
            Exits the process on any failure.
            """
            # Note: downloader only used for options
            if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

            downloader.to_screen('Updating to latest stable version...')
            try:
                # LATEST_VERSION names a tag; the script itself is fetched
                # from that tag's tree.
                latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
                latest_version = urllib.urlopen(latest_url).read().strip()
                prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
                newcontent = urllib.urlopen(prog_url).read()
            except (IOError, OSError), err:
                sys.exit('ERROR: unable to download latest version')
            try:
                stream = open(filename, 'w')
                stream.write(newcontent)
                stream.close()
            except (IOError, OSError), err:
                sys.exit('ERROR: unable to overwrite current version')
            downloader.to_screen('Updated to version %s' % latest_version)

        # Parse command line
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='2011.03.29',
            conflict_handler='resolve',
        )

        # General options (top-level, not in an OptionGroup)
        parser.add_option('-h', '--help',
                action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                action='version', help='print program version and exit')
        parser.add_option('-U', '--update',
                action='store_true', dest='update_self', help='update this program to latest stable version')
        parser.add_option('-i', '--ignore-errors',
                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        parser.add_option('-r', '--rate-limit',
                dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
        parser.add_option('-R', '--retries',
                dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
        parser.add_option('--playlist-start',
                dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
        parser.add_option('--playlist-end',
                dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
        parser.add_option('--dump-user-agent',
                action='store_true', dest='dump_user_agent',
                help='display the current browser identification', default=False)

        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        authentication.add_option('-u', '--username',
                dest='username', metavar='USERNAME', help='account username')
        authentication.add_option('-p', '--password',
                dest='password', metavar='PASSWORD', help='account password')
        authentication.add_option('-n', '--netrc',
                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option_group(authentication)

        video_format = optparse.OptionGroup(parser, 'Video Format Options')
        video_format.add_option('-f', '--format',
                action='store', dest='format', metavar='FORMAT', help='video format code')
        video_format.add_option('--all-formats',
                action='store_const', dest='format', help='download all available video formats', const='-1')
        video_format.add_option('--max-quality',
                action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
        parser.add_option_group(video_format)

        # The --get-* options imply both quiet and simulate (see the
        # FileDownloader parameter dict below).
        verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
        verbosity.add_option('-q', '--quiet',
                action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                action='store_true', dest='simulate', help='do not download video', default=False)
        verbosity.add_option('-g', '--get-url',
                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        verbosity.add_option('--get-thumbnail',
                action='store_true', dest='getthumbnail',
                help='simulate, quiet but print thumbnail URL', default=False)
        verbosity.add_option('--get-description',
                action='store_true', dest='getdescription',
                help='simulate, quiet but print video description', default=False)
        verbosity.add_option('--get-filename',
                action='store_true', dest='getfilename',
                help='simulate, quiet but print output filename', default=False)
        verbosity.add_option('--no-progress',
                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
        verbosity.add_option('--console-title',
                action='store_true', dest='consoletitle',
                help='display progress in console titlebar', default=False)
        parser.add_option_group(verbosity)

        filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
        filesystem.add_option('-t', '--title',
                action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-A', '--auto-number',
                action='store_true', dest='autonumber',
                help='number downloaded files starting from 00000', default=False)
        filesystem.add_option('-o', '--output',
                dest='outtmpl', metavar='TEMPLATE', help='output filename template')
        filesystem.add_option('-a', '--batch-file',
                dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
        filesystem.add_option('-w', '--no-overwrites',
                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        filesystem.add_option('--cookies',
                dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
        filesystem.add_option('--no-part',
                action='store_true', dest='nopart', help='do not use .part files', default=False)
        filesystem.add_option('--no-mtime',
                action='store_false', dest='updatetime',
                help='do not use the Last-modified header to set the file modification time', default=True)
        parser.add_option_group(filesystem)

        postproc = optparse.OptionGroup(parser, 'Post-processing Options')
        postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
                help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
        postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
                help='"best", "aac" or "mp3"; best by default')
        parser.add_option_group(postproc)

        (opts, args) = parser.parse_args()

        # Open appropriate CookieJar
        if opts.cookiefile is None:
            # No --cookies given: keep cookies in memory only.
            jar = cookielib.CookieJar()
        else:
            try:
                jar = cookielib.MozillaCookieJar(opts.cookiefile)
                # Only load the jar if the file already exists and is
                # readable; otherwise it is created on save() below.
                if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
                    jar.load()
            except (IOError, OSError), err:
                sys.exit(u'ERROR: unable to open cookie file')

        # Dump user agent
        if opts.dump_user_agent:
            print std_headers['User-Agent']
            sys.exit(0)

        # General configuration: install a global urllib2 opener so every
        # extractor shares cookies, proxy handling and gzip support.
        cookie_processor = urllib2.HTTPCookieProcessor(jar)
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
            try:
                if opts.batchfile == '-':
                    batchfd = sys.stdin
                else:
                    batchfd = open(opts.batchfile, 'r')
                batchurls = batchfd.readlines()
                batchurls = [x.strip() for x in batchurls]
                # Skip blank lines and lines starting with '#', '/' or ';'
                # (treated as comments).
                batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
            except IOError:
                sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # Conflicting, missing and erroneous options
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
            parser.error(u'using output template conflicts with using title, literal title or auto number')
        if opts.usetitle and opts.useliteral:
            parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            # Username without password: prompt interactively (never echo).
            opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
            # Convert '50k' / '44.6m' style limits to a byte count.
            numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
            if numeric_limit is None:
                parser.error(u'invalid rate limit specified')
            opts.ratelimit = numeric_limit
        if opts.retries is not None:
            try:
                opts.retries = long(opts.retries)
            except (TypeError, ValueError), err:
                parser.error(u'invalid retry count specified')
        try:
            opts.playliststart = long(opts.playliststart)
            if opts.playliststart <= 0:
                raise ValueError
        except (TypeError, ValueError), err:
            parser.error(u'invalid playlist start number specified')
        try:
            opts.playlistend = long(opts.playlistend)
            # -1 means "until the last video"; otherwise the end must be a
            # positive index not before the start.
            if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
                raise ValueError
        except (TypeError, ValueError), err:
            parser.error(u'invalid playlist end number specified')
        if opts.extractaudio:
            if opts.audioformat not in ['best', 'aac', 'mp3']:
                parser.error(u'invalid audio format specified')

        # Information extractors
        vimeo_ie = VimeoIE()
        youtube_ie = YoutubeIE()
        metacafe_ie = MetacafeIE(youtube_ie)
        dailymotion_ie = DailymotionIE()
        youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
        youtube_user_ie = YoutubeUserIE(youtube_ie)
        youtube_search_ie = YoutubeSearchIE(youtube_ie)
        google_ie = GoogleIE()
        google_search_ie = GoogleSearchIE(google_ie)
        photobucket_ie = PhotobucketIE()
        yahoo_ie = YahooIE()
        yahoo_search_ie = YahooSearchIE(yahoo_ie)
        deposit_files_ie = DepositFilesIE()
        facebook_ie = FacebookIE()
        generic_ie = GenericIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            # Any --get-* option implies both quiet and simulate modes.
            'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'forcethumbnail': opts.getthumbnail,
            'forcedescription': opts.getdescription,
            'forcefilename': opts.getfilename,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
            'format': opts.format,
            'format_limit': opts.format_limit,
            # Output template: an explicit -o wins; otherwise the first
            # matching default below is used ('%(id)s.%(ext)s' as fallback).
            'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
                or u'%(id)s.%(ext)s'),
            'ignoreerrors': opts.ignoreerrors,
            'ratelimit': opts.ratelimit,
            'nooverwrites': opts.nooverwrites,
            'retries': opts.retries,
            'continuedl': opts.continue_dl,
            'noprogress': opts.noprogress,
            'playliststart': opts.playliststart,
            'playlistend': opts.playlistend,
            # '-o -' writes the video to stdout, so logs go to stderr.
            'logtostderr': opts.outtmpl == '-',
            'consoletitle': opts.consoletitle,
            'nopart': opts.nopart,
            'updatetime': opts.updatetime,
        })
        # Registration order matters: the first IE whose pattern matches a
        # URL handles it.
        fd.add_info_extractor(vimeo_ie)
        fd.add_info_extractor(youtube_search_ie)
        fd.add_info_extractor(youtube_pl_ie)
        fd.add_info_extractor(youtube_user_ie)
        fd.add_info_extractor(metacafe_ie)
        fd.add_info_extractor(dailymotion_ie)
        fd.add_info_extractor(youtube_ie)
        fd.add_info_extractor(google_ie)
        fd.add_info_extractor(google_search_ie)
        fd.add_info_extractor(photobucket_ie)
        fd.add_info_extractor(yahoo_ie)
        fd.add_info_extractor(yahoo_search_ie)
        fd.add_info_extractor(deposit_files_ie)
        fd.add_info_extractor(facebook_ie)

        # This must come last since it's the
        # fallback if none of the others work
        fd.add_info_extractor(generic_ie)

        # PostProcessors
        if opts.extractaudio:
            fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

        # Update version
        if opts.update_self:
            update_self(fd, sys.argv[0])

        # Maybe do nothing
        if len(all_urls) < 1:
            # No URLs is only an error when we weren't asked to self-update.
            if not opts.update_self:
                parser.error(u'you must provide at least one URL')
            else:
                sys.exit()
        retcode = fd.download(all_urls)

        # Dump cookie jar if requested
        if opts.cookiefile is not None:
            try:
                jar.save()
            except (IOError, OSError), err:
                sys.exit(u'ERROR: unable to save cookie jar')

        sys.exit(retcode)

    except DownloadError:
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')