]> jfr.im git - yt-dlp.git/blob - youtube-dl
Correct comedycentral flash URL regex
[yt-dlp.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Module metadata: contributor credits, license, release version and the
# canonical raw-file URL polled by the self-update mechanism (-U).
__author__  = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    'Kevin Ngo',
    'Ori Avtalion',
    'shizeeg',
    )

__license__ = 'Public Domain'
__version__ = '2011.12.18'

UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48 import ctypes
49
50 try:
51 import email.utils
52 except ImportError: # Python 2.4
53 import email.Utils
54 try:
55 import cStringIO as StringIO
56 except ImportError:
57 import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61 from urlparse import parse_qs
62 except ImportError:
63 from cgi import parse_qs
64
65 try:
66 import lxml.etree
67 except ImportError:
68 pass # Handled below
69
70 try:
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Default HTTP headers attached to every outgoing request by
# YoutubeDLHandler.http_request(), mimicking a desktop Firefox browser so
# sites serve the same pages they would to a regular visitor.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
82
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    # Minimal pure-Python JSON decoder exposing only json.loads(), used as a
    # drop-in fallback when the stdlib json module is unavailable.  Each
    # parse* helper takes an index into the input string and returns
    # (next_index, parsed_value).
    import re
    class json(object):
        @staticmethod
        def loads(s):
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past JSON whitespace; optionally demand more input.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1 + 4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5 + 6 and esc[5:7] == '\\u':
                        # Surrogate pair: combine high and low halves.
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                while True:
                    e = s.index('"', e)
                    # A quote preceded by an odd number of backslashes is escaped.
                    bslashes = 0
                    while s[e - bslashes - 1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e + 1, stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i + 1, res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i, key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i, val = parse(i + 1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i + 1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i + 1)
            def parseArray(i):
                res = []
                i = skipSpace(i + 1)
                if s[i] == ']': # Empty array
                    return (i + 1, res)
                while True:
                    i, val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i + 1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i + 1)
            def parseDiscrete(i):
                # true / false / null literals.
                for k, v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i + len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i + len(nums), float(nums))
                return (i + len(nums), int(nums))
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                # Dispatch on the first significant character; default: number.
                i = skipSpace(i)
                i, res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i, res)
            i, res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
195
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    Falls back to UTF-8 when the locale encoding is unusable.
    """
    # NOTE(review): the original wrapped this in a one-shot infinite
    # generator (yield_preferredencoding().next()); that indirection added
    # nothing — a new generator was built on every call — so it is inlined.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually exists and can encode text.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
211
212
213 def htmlentity_transform(matchobj):
214 """Transforms an HTML entity to a Unicode character.
215
216 This function receives a match object and is intended to be used with
217 the re.sub() function.
218 """
219 entity = matchobj.group(1)
220
221 # Known non-numeric HTML entity
222 if entity in htmlentitydefs.name2codepoint:
223 return unichr(htmlentitydefs.name2codepoint[entity])
224
225 # Unicode character
226 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 if mobj is not None:
228 numstr = mobj.group(1)
229 if numstr.startswith(u'x'):
230 base = 16
231 numstr = u'0%s' % numstr
232 else:
233 base = 10
234 return unichr(long(numstr, base))
235
236 # Unknown entity in name, return its literal representation
237 return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241 """Sanitizes a video title so it could be used as part of a filename."""
242 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243 return utitle.replace(unicode(os.sep), u'%')
244
245
246 def sanitize_open(filename, open_mode):
247 """Try to open the given filename, and slightly tweak it if this fails.
248
249 Attempts to open the given filename. If this fails, it tries to change
250 the filename slightly, step by step, until it's either able to open it
251 or it fails and raises a final exception, like the standard open()
252 function.
253
254 It returns the tuple (stream, definitive_file_name).
255 """
256 try:
257 if filename == u'-':
258 if sys.platform == 'win32':
259 import msvcrt
260 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261 return (sys.stdout, filename)
262 stream = open(filename, open_mode)
263 return (stream, filename)
264 except (IOError, OSError), err:
265 # In case of error, try to remove win32 forbidden chars
266 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267
268 # An exception here should be caught in the caller
269 stream = open(filename, open_mode)
270 return (stream, filename)
271
272
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the Unix timestamp as a number, or None when the string
    cannot be parsed as an RFC 2822 date.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
280
281 def _simplify_title(title):
282 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283 return expr.sub(u'_', title).strip(u'_')
284
285 def _orderedSet(iterable):
286 """ Remove all duplicates from the input iterable """
287 res = []
288 for el in iterable:
289 if el not in res:
290 res.append(el)
291 return res
292
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
301
302
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
310
311
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
319
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
323
324
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
332
333
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
348
349
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

      http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Some servers send raw deflate data (no zlib header); try that
        # first, then fall back to standard zlib-wrapped decompression.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # addinfourl grew a 'code' constructor argument only in newer
        # Pythons; emulate it on older versions.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force our std_headers onto the request (replacing any existing
        # value), then honour the Youtubedl-No-Compression marker header.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
407
408
409 class FileDownloader(object):
410 """File Downloader class.
411
412 File downloader objects are the ones responsible of downloading the
413 actual video file and writing it to disk if the user has requested
414 it, among some other tasks. In most cases there should be one per
415 program. As, given a video URL, the downloader doesn't know how to
416 extract all the needed information, task that InfoExtractors do, it
417 has to pass the URL to one of them.
418
419 For this, file downloader objects have a method that allows
420 InfoExtractors to be registered in a given order. When it is passed
421 a URL, the file downloader handles it to the first InfoExtractor it
422 finds that reports being able to handle it. The InfoExtractor extracts
423 all the information about the video or videos the URL refers to, and
424 asks the FileDownloader to process the video information, possibly
425 downloading the video.
426
427 File downloaders accept a lot of parameters. In order not to saturate
428 the object constructor with arguments, it receives a dictionary of
429 options instead. These options are available through the params
430 attribute for the InfoExtractors to use. The FileDownloader also
431 registers itself as the downloader in charge for the InfoExtractors
432 that are added to it, so this is a "mutual registration".
433
434 Available options:
435
436 username: Username for authentication purposes.
437 password: Password for authentication purposes.
438 usenetrc: Use netrc for authentication instead.
439 quiet: Do not print messages to stdout.
440 forceurl: Force printing final URL.
441 forcetitle: Force printing title.
442 forcethumbnail: Force printing thumbnail URL.
443 forcedescription: Force printing description.
444 forcefilename: Force printing final filename.
445 simulate: Do not download the video files.
446 format: Video format code.
447 format_limit: Highest quality format to try.
448 outtmpl: Template for output names.
449 ignoreerrors: Do not stop on download errors.
450 ratelimit: Download speed limit, in bytes/sec.
451 nooverwrites: Prevent overwriting files.
452 retries: Number of times to retry for HTTP error 5xx
453 continuedl: Try to continue downloads if possible.
454 noprogress: Do not print the progress bar.
455 playliststart: Playlist item to start at.
456 playlistend: Playlist item to end at.
457 matchtitle: Download only matching titles.
458 rejecttitle: Reject downloads for matching titles.
459 logtostderr: Log messages to stderr instead of stdout.
460 consoletitle: Display progress in console window's titlebar.
461 nopart: Do not use temporary .part files.
462 updatetime: Use the Last-modified header to set output file timestamps.
463 writedescription: Write the video description to a .description file
464 writeinfojson: Write the video description to a .info.json file
465 """
466
467 params = None
468 _ies = []
469 _pps = []
470 _download_retcode = None
471 _num_downloads = None
472 _screen_file = None
473
474 def __init__(self, params):
475 """Create a FileDownloader object with the given options."""
476 self._ies = []
477 self._pps = []
478 self._download_retcode = 0
479 self._num_downloads = 0
480 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
481 self.params = params
482
483 @staticmethod
484 def format_bytes(bytes):
485 if bytes is None:
486 return 'N/A'
487 if type(bytes) is str:
488 bytes = float(bytes)
489 if bytes == 0.0:
490 exponent = 0
491 else:
492 exponent = long(math.log(bytes, 1024.0))
493 suffix = 'bkMGTPEZY'[exponent]
494 converted = float(bytes) / float(1024 ** exponent)
495 return '%.2f%s' % (converted, suffix)
496
497 @staticmethod
498 def calc_percent(byte_counter, data_len):
499 if data_len is None:
500 return '---.-%'
501 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
502
503 @staticmethod
504 def calc_eta(start, now, total, current):
505 if total is None:
506 return '--:--'
507 dif = now - start
508 if current == 0 or dif < 0.001: # One millisecond
509 return '--:--'
510 rate = float(current) / dif
511 eta = long((float(total) - float(current)) / rate)
512 (eta_mins, eta_secs) = divmod(eta, 60)
513 if eta_mins > 99:
514 return '--:--'
515 return '%02d:%02d' % (eta_mins, eta_secs)
516
517 @staticmethod
518 def calc_speed(start, now, bytes):
519 dif = now - start
520 if bytes == 0 or dif < 0.001: # One millisecond
521 return '%10s' % '---b/s'
522 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
523
524 @staticmethod
525 def best_block_size(elapsed_time, bytes):
526 new_min = max(bytes / 2.0, 1.0)
527 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
528 if elapsed_time < 0.001:
529 return long(new_max)
530 rate = bytes / elapsed_time
531 if rate > new_max:
532 return long(new_max)
533 if rate < new_min:
534 return long(new_min)
535 return long(rate)
536
537 @staticmethod
538 def parse_bytes(bytestr):
539 """Parse a string indicating a byte quantity into a long integer."""
540 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
541 if matchobj is None:
542 return None
543 number = float(matchobj.group(1))
544 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
545 return long(round(number * multiplier))
546
547 def add_info_extractor(self, ie):
548 """Add an InfoExtractor object to the end of the list."""
549 self._ies.append(ie)
550 ie.set_downloader(self)
551
552 def add_post_processor(self, pp):
553 """Add a PostProcessor object to the end of the chain."""
554 self._pps.append(pp)
555 pp.set_downloader(self)
556
557 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
558 """Print message to stdout if not in quiet mode."""
559 try:
560 if not self.params.get('quiet', False):
561 terminator = [u'\n', u''][skip_eol]
562 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
563 self._screen_file.flush()
564 except (UnicodeEncodeError), err:
565 if not ignore_encoding_errors:
566 raise
567
568 def to_stderr(self, message):
569 """Print message to stderr."""
570 print >>sys.stderr, message.encode(preferredencoding())
571
572 def to_cons_title(self, message):
573 """Set console/terminal window title to message."""
574 if not self.params.get('consoletitle', False):
575 return
576 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
577 # c_wchar_p() might not be necessary if `message` is
578 # already of type unicode()
579 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
580 elif 'TERM' in os.environ:
581 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
582
583 def fixed_template(self):
584 """Checks if the output template is fixed."""
585 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
586
587 def trouble(self, message=None):
588 """Determine action to take when a download problem appears.
589
590 Depending on if the downloader has been configured to ignore
591 download errors or not, this method may throw an exception or
592 not when errors are found, after printing the message.
593 """
594 if message is not None:
595 self.to_stderr(message)
596 if not self.params.get('ignoreerrors', False):
597 raise DownloadError(message)
598 self._download_retcode = 1
599
600 def slow_down(self, start_time, byte_counter):
601 """Sleep if the download speed is over the rate limit."""
602 rate_limit = self.params.get('ratelimit', None)
603 if rate_limit is None or byte_counter == 0:
604 return
605 now = time.time()
606 elapsed = now - start_time
607 if elapsed <= 0.0:
608 return
609 speed = float(byte_counter) / elapsed
610 if speed > rate_limit:
611 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
612
613 def temp_name(self, filename):
614 """Returns a temporary filename for the given filename."""
615 if self.params.get('nopart', False) or filename == u'-' or \
616 (os.path.exists(filename) and not os.path.isfile(filename)):
617 return filename
618 return filename + u'.part'
619
620 def undo_temp_name(self, filename):
621 if filename.endswith(u'.part'):
622 return filename[:-len(u'.part')]
623 return filename
624
625 def try_rename(self, old_filename, new_filename):
626 try:
627 if old_filename == new_filename:
628 return
629 os.rename(old_filename, new_filename)
630 except (IOError, OSError), err:
631 self.trouble(u'ERROR: unable to rename file')
632
633 def try_utime(self, filename, last_modified_hdr):
634 """Try to set the last-modified time of the given file."""
635 if last_modified_hdr is None:
636 return
637 if not os.path.isfile(filename):
638 return
639 timestr = last_modified_hdr
640 if timestr is None:
641 return
642 filetime = timeconvert(timestr)
643 if filetime is None:
644 return filetime
645 try:
646 os.utime(filename, (time.time(), filetime))
647 except:
648 pass
649 return filetime
650
651 def report_writedescription(self, descfn):
652 """ Report that the description file is being written """
653 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
654
655 def report_writeinfojson(self, infofn):
656 """ Report that the metadata file has been written """
657 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
658
659 def report_destination(self, filename):
660 """Report destination filename."""
661 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
662
663 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
664 """Report download progress."""
665 if self.params.get('noprogress', False):
666 return
667 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
668 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
669 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
670 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
671
672 def report_resuming_byte(self, resume_len):
673 """Report attempt to resume at given byte."""
674 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
675
676 def report_retry(self, count, retries):
677 """Report retry in case of HTTP error 5xx"""
678 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
679
680 def report_file_already_downloaded(self, file_name):
681 """Report file has already been fully downloaded."""
682 try:
683 self.to_screen(u'[download] %s has already been downloaded' % file_name)
684 except (UnicodeEncodeError), err:
685 self.to_screen(u'[download] The file has already been downloaded')
686
687 def report_unable_to_resume(self):
688 """Report it was impossible to resume download."""
689 self.to_screen(u'[download] Unable to resume')
690
691 def report_finish(self):
692 """Report download finished."""
693 if self.params.get('noprogress', False):
694 self.to_screen(u'[download] Download completed')
695 else:
696 self.to_screen(u'')
697
698 def increment_downloads(self):
699 """Increment the ordinal that assigns a number to each file."""
700 self._num_downloads += 1
701
702 def prepare_filename(self, info_dict):
703 """Generate the output filename."""
704 try:
705 template_dict = dict(info_dict)
706 template_dict['epoch'] = unicode(long(time.time()))
707 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
708 filename = self.params['outtmpl'] % template_dict
709 return filename
710 except (ValueError, KeyError), err:
711 self.trouble(u'ERROR: invalid system charset or erroneous output template')
712 return None
713
714 def _match_entry(self, info_dict):
715 """ Returns None iff the file should be downloaded """
716
717 title = info_dict['title']
718 matchtitle = self.params.get('matchtitle', False)
719 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
720 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
721 rejecttitle = self.params.get('rejecttitle', False)
722 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
723 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
724 return None
725
726 def process_info(self, info_dict):
727 """Process a single dictionary returned by an InfoExtractor."""
728
729 reason = self._match_entry(info_dict)
730 if reason is not None:
731 self.to_screen(u'[download] ' + reason)
732 return
733
734 max_downloads = self.params.get('max_downloads')
735 if max_downloads is not None:
736 if self._num_downloads > int(max_downloads):
737 raise MaxDownloadsReached()
738
739 filename = self.prepare_filename(info_dict)
740
741 # Forced printings
742 if self.params.get('forcetitle', False):
743 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
744 if self.params.get('forceurl', False):
745 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
746 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
747 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
748 if self.params.get('forcedescription', False) and 'description' in info_dict:
749 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
750 if self.params.get('forcefilename', False) and filename is not None:
751 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
752 if self.params.get('forceformat', False):
753 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
754
755 # Do nothing else if in simulate mode
756 if self.params.get('simulate', False):
757 return
758
759 if filename is None:
760 return
761
762 try:
763 dn = os.path.dirname(filename)
764 if dn != '' and not os.path.exists(dn):
765 os.makedirs(dn)
766 except (OSError, IOError), err:
767 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
768 return
769
770 if self.params.get('writedescription', False):
771 try:
772 descfn = filename + '.description'
773 self.report_writedescription(descfn)
774 descfile = open(descfn, 'wb')
775 try:
776 descfile.write(info_dict['description'].encode('utf-8'))
777 finally:
778 descfile.close()
779 except (OSError, IOError):
780 self.trouble(u'ERROR: Cannot write description file ' + descfn)
781 return
782
783 if self.params.get('writeinfojson', False):
784 infofn = filename + '.info.json'
785 self.report_writeinfojson(infofn)
786 try:
787 json.dump
788 except (NameError,AttributeError):
789 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
790 return
791 try:
792 infof = open(infofn, 'wb')
793 try:
794 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
795 json.dump(json_info_dict, infof)
796 finally:
797 infof.close()
798 except (OSError, IOError):
799 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
800 return
801
802 if not self.params.get('skip_download', False):
803 if self.params.get('nooverwrites', False) and os.path.exists(filename):
804 success = True
805 else:
806 try:
807 success = self._do_download(filename, info_dict)
808 except (OSError, IOError), err:
809 raise UnavailableVideoError
810 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
811 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
812 return
813 except (ContentTooShortError, ), err:
814 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
815 return
816
817 if success:
818 try:
819 self.post_process(filename, info_dict)
820 except (PostProcessingError), err:
821 self.trouble(u'ERROR: postprocessing: %s' % str(err))
822 return
823
824 def download(self, url_list):
825 """Download a given list of URLs."""
826 if len(url_list) > 1 and self.fixed_template():
827 raise SameFileError(self.params['outtmpl'])
828
829 for url in url_list:
830 suitable_found = False
831 for ie in self._ies:
832 # Go to next InfoExtractor if not suitable
833 if not ie.suitable(url):
834 continue
835
836 # Suitable InfoExtractor found
837 suitable_found = True
838
839 # Extract information from URL and process it
840 ie.extract(url)
841
842 # Suitable InfoExtractor had been found; go to next URL
843 break
844
845 if not suitable_found:
846 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
847
848 return self._download_retcode
849
850 def post_process(self, filename, ie_info):
851 """Run the postprocessing chain on the given file."""
852 info = dict(ie_info)
853 info['filepath'] = filename
854 for pp in self._pps:
855 info = pp.run(info)
856 if info is None:
857 break
858
859 def _download_with_rtmpdump(self, filename, url, player_url):
860 self.report_destination(filename)
861 tmpfilename = self.temp_name(filename)
862
863 # Check for rtmpdump first
864 try:
865 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
866 except (OSError, IOError):
867 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
868 return False
869
870 # Download using rtmpdump. rtmpdump returns exit code 2 when
871 # the connection was interrumpted and resuming appears to be
872 # possible. This is part of rtmpdump's normal usage, AFAIK.
873 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
874 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
875 while retval == 2 or retval == 1:
876 prevsize = os.path.getsize(tmpfilename)
877 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
878 time.sleep(5.0) # This seems to be needed
879 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
880 cursize = os.path.getsize(tmpfilename)
881 if prevsize == cursize and retval == 1:
882 break
883 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
884 if prevsize == cursize and retval == 2 and cursize > 1024:
885 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
886 retval = 0
887 break
888 if retval == 0:
889 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
890 self.try_rename(tmpfilename, filename)
891 return True
892 else:
893 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
894 return False
895
896 def _do_download(self, filename, info_dict):
897 url = info_dict['url']
898 player_url = info_dict.get('player_url', None)
899
900 # Check file already present
901 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
902 self.report_file_already_downloaded(filename)
903 return True
904
905 # Attempt to download using rtmpdump
906 if url.startswith('rtmp'):
907 return self._download_with_rtmpdump(filename, url, player_url)
908
909 tmpfilename = self.temp_name(filename)
910 stream = None
911
912 # Do not include the Accept-Encoding header
913 headers = {'Youtubedl-no-compression': 'True'}
914 basic_request = urllib2.Request(url, None, headers)
915 request = urllib2.Request(url, None, headers)
916
917 # Establish possible resume length
918 if os.path.isfile(tmpfilename):
919 resume_len = os.path.getsize(tmpfilename)
920 else:
921 resume_len = 0
922
923 open_mode = 'wb'
924 if resume_len != 0:
925 if self.params.get('continuedl', False):
926 self.report_resuming_byte(resume_len)
927 request.add_header('Range','bytes=%d-' % resume_len)
928 open_mode = 'ab'
929 else:
930 resume_len = 0
931
932 count = 0
933 retries = self.params.get('retries', 0)
934 while count <= retries:
935 # Establish connection
936 try:
937 if count == 0 and 'urlhandle' in info_dict:
938 data = info_dict['urlhandle']
939 data = urllib2.urlopen(request)
940 break
941 except (urllib2.HTTPError, ), err:
942 if (err.code < 500 or err.code >= 600) and err.code != 416:
943 # Unexpected HTTP error
944 raise
945 elif err.code == 416:
946 # Unable to resume (requested range not satisfiable)
947 try:
948 # Open the connection again without the range header
949 data = urllib2.urlopen(basic_request)
950 content_length = data.info()['Content-Length']
951 except (urllib2.HTTPError, ), err:
952 if err.code < 500 or err.code >= 600:
953 raise
954 else:
955 # Examine the reported length
956 if (content_length is not None and
957 (resume_len - 100 < long(content_length) < resume_len + 100)):
958 # The file had already been fully downloaded.
959 # Explanation to the above condition: in issue #175 it was revealed that
960 # YouTube sometimes adds or removes a few bytes from the end of the file,
961 # changing the file size slightly and causing problems for some users. So
962 # I decided to implement a suggested change and consider the file
963 # completely downloaded if the file size differs less than 100 bytes from
964 # the one in the hard drive.
965 self.report_file_already_downloaded(filename)
966 self.try_rename(tmpfilename, filename)
967 return True
968 else:
969 # The length does not match, we start the download over
970 self.report_unable_to_resume()
971 open_mode = 'wb'
972 break
973 # Retry
974 count += 1
975 if count <= retries:
976 self.report_retry(count, retries)
977
978 if count > retries:
979 self.trouble(u'ERROR: giving up after %s retries' % retries)
980 return False
981
982 data_len = data.info().get('Content-length', None)
983 if data_len is not None:
984 data_len = long(data_len) + resume_len
985 data_len_str = self.format_bytes(data_len)
986 byte_counter = 0 + resume_len
987 block_size = 1024
988 start = time.time()
989 while True:
990 # Download and write
991 before = time.time()
992 data_block = data.read(block_size)
993 after = time.time()
994 if len(data_block) == 0:
995 break
996 byte_counter += len(data_block)
997
998 # Open file just in time
999 if stream is None:
1000 try:
1001 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1002 assert stream is not None
1003 filename = self.undo_temp_name(tmpfilename)
1004 self.report_destination(filename)
1005 except (OSError, IOError), err:
1006 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1007 return False
1008 try:
1009 stream.write(data_block)
1010 except (IOError, OSError), err:
1011 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1012 return False
1013 block_size = self.best_block_size(after - before, len(data_block))
1014
1015 # Progress message
1016 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1017 if data_len is None:
1018 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1019 else:
1020 percent_str = self.calc_percent(byte_counter, data_len)
1021 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1022 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1023
1024 # Apply rate limit
1025 self.slow_down(start, byte_counter - resume_len)
1026
1027 if stream is None:
1028 self.trouble(u'\nERROR: Did not get any data blocks')
1029 return False
1030 stream.close()
1031 self.report_finish()
1032 if data_len is not None and byte_counter != data_len:
1033 raise ContentTooShortError(byte_counter, long(data_len))
1034 self.try_rename(tmpfilename, filename)
1035
1036 # Update file modification time
1037 if self.params.get('updatetime', True):
1038 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1039
1040 return True
1041
1042
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    # Lazy-init flag: set once _real_initialize() has run.
    _ready = False
    # The FileDownloader this extractor reports to (set via set_downloader).
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
1112
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1 captures the URL prefix up to the video id, group 2 the id
    # itself; the conditional (?(1).+)? requires trailing characters only
    # when a prefix was matched (i.e. a full URL rather than a bare id).
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/file extension (formats absent here default to flv)
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string; '???' means unknown
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
    }
    IE_NAME = u'youtube'

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        for x in formats:
            print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set interface language and, if credentials are available,
        log in and confirm age. Credentials come from --username/--password
        or, with --netrc, from the 'youtube' machine entry in ~/.netrc.
        All failures here are soft (warnings), except age confirmation."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, authentication failed
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract metadata and stream URL(s) for one YouTube video and
        hand each selected format to the downloader via process_info().
        Errors are reported through self._downloader.trouble()."""
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL (needed by rtmpdump for
        # SWF verification); the page embeds it with escaped slashes.
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants, since different
        # video classes (embedded, vevo, ...) answer to different ones.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = _simplify_title(video_title)

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page and normalized to
        # YYYYMMDD by trying each known display format in turn.
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # Try the next format; on total failure the raw
                    # scraped string is kept as-is.
                    pass

        # description
        try:
            # Probe whether the optional lxml import at the top of the
            # file succeeded; NameError means it is unavailable.
            lxml.etree
        except NameError:
            video_description = u'No description available.'
            if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
                mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
                if mobj is not None:
                    video_description = mobj.group(1).decode('utf-8')
        else:
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
            # TODO use another parser

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            try:
                # Process video information
                self._downloader.process_info({
                    'id':       video_id.decode('utf-8'),
                    'url':      video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date':  upload_date,
                    'title':    video_title,
                    'stitle':   simple_title,
                    'ext':      video_extension.decode('utf-8'),
                    'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail':    video_thumbnail.decode('utf-8'),
                    'description':  video_description,
                    'player_url':   player_url,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
1431
1432
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the numeric video id, group 2 the simplified title slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # YoutubeIE instance used to delegate 'yt-' prefixed Metacafe ids.
    _youtube_ie = None
    IE_NAME = u'metacafe'

    def __init__(self, youtube_ie, downloader=None):
        """Constructor. Receives the YoutubeIE to delegate YouTube-hosted
        videos to, and an optional downloader."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form so
        that age-restricted videos become accessible via cookies."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Metacafe watch
        page and hand the result to the downloader. Errors are reported
        through self._downloader.trouble()."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage. Two page layouts
        # exist: a plain &mediaURL= parameter, or a flashvars blob whose
        # mediaData JSON holds the URL and access key.
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available (access key appended to the URL)
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id':       video_id.decode('utf-8'),
                'url':      video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  u'NA',
                'title':    video_title,
                'stitle':   simple_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   u'NA',
                'player_url':   None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1573
1574
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Group 1 is the video id (part before the first underscore),
    # group 2 the title slug.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Dailymotion page
        and hand the result to the downloader. Errors are reported
        through self._downloader.trouble()."""
        htmlParser = HTMLParser.HTMLParser()

        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information;
        # the cookie disables the family filter for restricted videos.
        request = urllib2.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage: the player is
        # fed a URL-encoded "sequence" JSON blob containing sdURL.
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = htmlParser.unescape(mobj.group('title')).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id':       video_id.decode('utf-8'),
                'url':      video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  u'NA',
                'title':    video_title,
                'stitle':   simple_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   u'NA',
                'player_url':   None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1663
1664
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Group 1 is the docid query parameter (the video id).
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title, description and (optionally)
        thumbnail from a Google Video page and hand the result to the
        downloader. Errors are reported through self._downloader.trouble()."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage. Prefer the mp4
        # download_url; fall back to the hex-escaped flv videoUrl.
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the JavaScript hex escapes (\x3d is '=', \x26 is '&')
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail: only fetched on demand because it
        # requires an extra search-page request.
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        try:
            # Process video information
            self._downloader.process_info({
                'id':       video_id.decode('utf-8'),
                'url':      video_url.decode('utf-8'),
                'uploader': u'NA',
                'upload_date':  u'NA',
                'title':    video_title,
                'stitle':   simple_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   u'NA',
                'player_url':   None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1767
1768
1769 class PhotobucketIE(InfoExtractor):
1770 """Information extractor for photobucket.com."""
1771
1772 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1773 IE_NAME = u'photobucket'
1774
1775 def __init__(self, downloader=None):
1776 InfoExtractor.__init__(self, downloader)
1777
1778 def report_download_webpage(self, video_id):
1779 """Report webpage download."""
1780 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1781
1782 def report_extraction(self, video_id):
1783 """Report information extraction."""
1784 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1785
1786 def _real_extract(self, url):
1787 # Extract id from URL
1788 mobj = re.match(self._VALID_URL, url)
1789 if mobj is None:
1790 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1791 return
1792
1793 # At this point we have a new video
1794 self._downloader.increment_downloads()
1795 video_id = mobj.group(1)
1796
1797 video_extension = 'flv'
1798
1799 # Retrieve video webpage to extract further information
1800 request = urllib2.Request(url)
1801 try:
1802 self.report_download_webpage(video_id)
1803 webpage = urllib2.urlopen(request).read()
1804 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1805 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1806 return
1807
1808 # Extract URL, uploader, and title from webpage
1809 self.report_extraction(video_id)
1810 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1811 if mobj is None:
1812 self._downloader.trouble(u'ERROR: unable to extract media URL')
1813 return
1814 mediaURL = urllib.unquote(mobj.group(1))
1815
1816 video_url = mediaURL
1817
1818 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1819 if mobj is None:
1820 self._downloader.trouble(u'ERROR: unable to extract title')
1821 return
1822 video_title = mobj.group(1).decode('utf-8')
1823 video_title = sanitize_title(video_title)
1824 simple_title = _simplify_title(vide_title)
1825
1826 video_uploader = mobj.group(2).decode('utf-8')
1827
1828 try:
1829 # Process video information
1830 self._downloader.process_info({
1831 'id': video_id.decode('utf-8'),
1832 'url': video_url.decode('utf-8'),
1833 'uploader': video_uploader,
1834 'upload_date': u'NA',
1835 'title': video_title,
1836 'stitle': simple_title,
1837 'ext': video_extension.decode('utf-8'),
1838 'format': u'NA',
1839 'player_url': None,
1840 })
1841 except UnavailableVideoError:
1842 self._downloader.trouble(u'\nERROR: unable to download video')
1843
1844
1845 class YahooIE(InfoExtractor):
1846 """Information extractor for video.yahoo.com."""
1847
1848 # _VALID_URL matches all Yahoo! Video URLs
1849 # _VPAGE_URL matches only the extractable '/watch/' URLs
1850 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1851 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1852 IE_NAME = u'video.yahoo'
1853
1854 def __init__(self, downloader=None):
1855 InfoExtractor.__init__(self, downloader)
1856
1857 def report_download_webpage(self, video_id):
1858 """Report webpage download."""
1859 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1860
1861 def report_extraction(self, video_id):
1862 """Report information extraction."""
1863 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1864
1865 def _real_extract(self, url, new_video=True):
1866 # Extract ID from URL
1867 mobj = re.match(self._VALID_URL, url)
1868 if mobj is None:
1869 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1870 return
1871
1872 # At this point we have a new video
1873 self._downloader.increment_downloads()
1874 video_id = mobj.group(2)
1875 video_extension = 'flv'
1876
1877 # Rewrite valid but non-extractable URLs as
1878 # extractable English language /watch/ URLs
1879 if re.match(self._VPAGE_URL, url) is None:
1880 request = urllib2.Request(url)
1881 try:
1882 webpage = urllib2.urlopen(request).read()
1883 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1884 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1885 return
1886
1887 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1888 if mobj is None:
1889 self._downloader.trouble(u'ERROR: Unable to extract id field')
1890 return
1891 yahoo_id = mobj.group(1)
1892
1893 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1894 if mobj is None:
1895 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1896 return
1897 yahoo_vid = mobj.group(1)
1898
1899 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1900 return self._real_extract(url, new_video=False)
1901
1902 # Retrieve video webpage to extract further information
1903 request = urllib2.Request(url)
1904 try:
1905 self.report_download_webpage(video_id)
1906 webpage = urllib2.urlopen(request).read()
1907 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1908 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1909 return
1910
1911 # Extract uploader and title from webpage
1912 self.report_extraction(video_id)
1913 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1914 if mobj is None:
1915 self._downloader.trouble(u'ERROR: unable to extract video title')
1916 return
1917 video_title = mobj.group(1).decode('utf-8')
1918 simple_title = _simplify_title(video_title)
1919
1920 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1921 if mobj is None:
1922 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1923 return
1924 video_uploader = mobj.group(1).decode('utf-8')
1925
1926 # Extract video thumbnail
1927 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1928 if mobj is None:
1929 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1930 return
1931 video_thumbnail = mobj.group(1).decode('utf-8')
1932
1933 # Extract video description
1934 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1935 if mobj is None:
1936 self._downloader.trouble(u'ERROR: unable to extract video description')
1937 return
1938 video_description = mobj.group(1).decode('utf-8')
1939 if not video_description:
1940 video_description = 'No description available.'
1941
1942 # Extract video height and width
1943 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1944 if mobj is None:
1945 self._downloader.trouble(u'ERROR: unable to extract video height')
1946 return
1947 yv_video_height = mobj.group(1)
1948
1949 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1950 if mobj is None:
1951 self._downloader.trouble(u'ERROR: unable to extract video width')
1952 return
1953 yv_video_width = mobj.group(1)
1954
1955 # Retrieve video playlist to extract media URL
1956 # I'm not completely sure what all these options are, but we
1957 # seem to need most of them, otherwise the server sends a 401.
1958 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1959 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1960 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1961 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1962 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1963 try:
1964 self.report_download_webpage(video_id)
1965 webpage = urllib2.urlopen(request).read()
1966 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1967 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1968 return
1969
1970 # Extract media URL from playlist XML
1971 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1972 if mobj is None:
1973 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1974 return
1975 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1976 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1977
1978 try:
1979 # Process video information
1980 self._downloader.process_info({
1981 'id': video_id.decode('utf-8'),
1982 'url': video_url,
1983 'uploader': video_uploader,
1984 'upload_date': u'NA',
1985 'title': video_title,
1986 'stitle': simple_title,
1987 'ext': video_extension.decode('utf-8'),
1988 'thumbnail': video_thumbnail.decode('utf-8'),
1989 'description': video_description,
1990 'thumbnail': video_thumbnail,
1991 'player_url': None,
1992 })
1993 except UnavailableVideoError:
1994 self._downloader.trouble(u'\nERROR: unable to download video')
1995
1996
1997 class VimeoIE(InfoExtractor):
1998 """Information extractor for vimeo.com."""
1999
2000 # _VALID_URL matches Vimeo URLs
2001 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2002 IE_NAME = u'vimeo'
2003
2004 def __init__(self, downloader=None):
2005 InfoExtractor.__init__(self, downloader)
2006
2007 def report_download_webpage(self, video_id):
2008 """Report webpage download."""
2009 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2010
2011 def report_extraction(self, video_id):
2012 """Report information extraction."""
2013 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2014
2015 def _real_extract(self, url, new_video=True):
2016 # Extract ID from URL
2017 mobj = re.match(self._VALID_URL, url)
2018 if mobj is None:
2019 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2020 return
2021
2022 # At this point we have a new video
2023 self._downloader.increment_downloads()
2024 video_id = mobj.group(1)
2025
2026 # Retrieve video webpage to extract further information
2027 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2028 try:
2029 self.report_download_webpage(video_id)
2030 webpage = urllib2.urlopen(request).read()
2031 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2032 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2033 return
2034
2035 # Now we begin extracting as much information as we can from what we
2036 # retrieved. First we extract the information common to all extractors,
2037 # and latter we extract those that are Vimeo specific.
2038 self.report_extraction(video_id)
2039
2040 # Extract title
2041 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2042 if mobj is None:
2043 self._downloader.trouble(u'ERROR: unable to extract video title')
2044 return
2045 video_title = mobj.group(1).decode('utf-8')
2046 simple_title = _simplify_title(video_title)
2047
2048 # Extract uploader
2049 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2050 if mobj is None:
2051 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2052 return
2053 video_uploader = mobj.group(1).decode('utf-8')
2054
2055 # Extract video thumbnail
2056 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2057 if mobj is None:
2058 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2059 return
2060 video_thumbnail = mobj.group(1).decode('utf-8')
2061
2062 # # Extract video description
2063 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2064 # if mobj is None:
2065 # self._downloader.trouble(u'ERROR: unable to extract video description')
2066 # return
2067 # video_description = mobj.group(1).decode('utf-8')
2068 # if not video_description: video_description = 'No description available.'
2069 video_description = 'Foo.'
2070
2071 # Vimeo specific: extract request signature
2072 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2073 if mobj is None:
2074 self._downloader.trouble(u'ERROR: unable to extract request signature')
2075 return
2076 sig = mobj.group(1).decode('utf-8')
2077
2078 # Vimeo specific: extract video quality information
2079 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2080 if mobj is None:
2081 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2082 return
2083 quality = mobj.group(1).decode('utf-8')
2084
2085 if int(quality) == 1:
2086 quality = 'hd'
2087 else:
2088 quality = 'sd'
2089
2090 # Vimeo specific: Extract request signature expiration
2091 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2092 if mobj is None:
2093 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2094 return
2095 sig_exp = mobj.group(1).decode('utf-8')
2096
2097 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2098
2099 try:
2100 # Process video information
2101 self._downloader.process_info({
2102 'id': video_id.decode('utf-8'),
2103 'url': video_url,
2104 'uploader': video_uploader,
2105 'upload_date': u'NA',
2106 'title': video_title,
2107 'stitle': simple_title,
2108 'ext': u'mp4',
2109 'thumbnail': video_thumbnail.decode('utf-8'),
2110 'description': video_description,
2111 'thumbnail': video_thumbnail,
2112 'description': video_description,
2113 'player_url': None,
2114 })
2115 except UnavailableVideoError:
2116 self._downloader.trouble(u'ERROR: unable to download video')
2117
2118
2119 class GenericIE(InfoExtractor):
2120 """Generic last-resort information extractor."""
2121
2122 _VALID_URL = r'.*'
2123 IE_NAME = u'generic'
2124
2125 def __init__(self, downloader=None):
2126 InfoExtractor.__init__(self, downloader)
2127
2128 def report_download_webpage(self, video_id):
2129 """Report webpage download."""
2130 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2131 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2132
2133 def report_extraction(self, video_id):
2134 """Report information extraction."""
2135 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2136
2137 def _real_extract(self, url):
2138 # At this point we have a new video
2139 self._downloader.increment_downloads()
2140
2141 video_id = url.split('/')[-1]
2142 request = urllib2.Request(url)
2143 try:
2144 self.report_download_webpage(video_id)
2145 webpage = urllib2.urlopen(request).read()
2146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2147 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2148 return
2149 except ValueError, err:
2150 # since this is the last-resort InfoExtractor, if
2151 # this error is thrown, it'll be thrown here
2152 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2153 return
2154
2155 self.report_extraction(video_id)
2156 # Start with something easy: JW Player in SWFObject
2157 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2158 if mobj is None:
2159 # Broaden the search a little bit
2160 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2161 if mobj is None:
2162 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2163 return
2164
2165 # It's possible that one of the regexes
2166 # matched, but returned an empty group:
2167 if mobj.group(1) is None:
2168 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2169 return
2170
2171 video_url = urllib.unquote(mobj.group(1))
2172 video_id = os.path.basename(video_url)
2173
2174 # here's a fun little line of code for you:
2175 video_extension = os.path.splitext(video_id)[1][1:]
2176 video_id = os.path.splitext(video_id)[0]
2177
2178 # it's tempting to parse this further, but you would
2179 # have to take into account all the variations like
2180 # Video Title - Site Name
2181 # Site Name | Video Title
2182 # Video Title - Tagline | Site Name
2183 # and so on and so forth; it's just not practical
2184 mobj = re.search(r'<title>(.*)</title>', webpage)
2185 if mobj is None:
2186 self._downloader.trouble(u'ERROR: unable to extract title')
2187 return
2188 video_title = mobj.group(1).decode('utf-8')
2189 video_title = sanitize_title(video_title)
2190 simple_title = _simplify_title(video_title)
2191
2192 # video uploader is domain name
2193 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2194 if mobj is None:
2195 self._downloader.trouble(u'ERROR: unable to extract title')
2196 return
2197 video_uploader = mobj.group(1).decode('utf-8')
2198
2199 try:
2200 # Process video information
2201 self._downloader.process_info({
2202 'id': video_id.decode('utf-8'),
2203 'url': video_url.decode('utf-8'),
2204 'uploader': video_uploader,
2205 'upload_date': u'NA',
2206 'title': video_title,
2207 'stitle': simple_title,
2208 'ext': video_extension.decode('utf-8'),
2209 'format': u'NA',
2210 'player_url': None,
2211 })
2212 except UnavailableVideoError, err:
2213 self._downloader.trouble(u'\nERROR: unable to download video')
2214
2215
2216 class YoutubeSearchIE(InfoExtractor):
2217 """Information Extractor for YouTube search queries."""
2218 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2219 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2220 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2221 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2222 _youtube_ie = None
2223 _max_youtube_results = 1000
2224 IE_NAME = u'youtube:search'
2225
2226 def __init__(self, youtube_ie, downloader=None):
2227 InfoExtractor.__init__(self, downloader)
2228 self._youtube_ie = youtube_ie
2229
2230 def report_download_page(self, query, pagenum):
2231 """Report attempt to download playlist page with given number."""
2232 query = query.decode(preferredencoding())
2233 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2234
2235 def _real_initialize(self):
2236 self._youtube_ie.initialize()
2237
2238 def _real_extract(self, query):
2239 mobj = re.match(self._VALID_URL, query)
2240 if mobj is None:
2241 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2242 return
2243
2244 prefix, query = query.split(':')
2245 prefix = prefix[8:]
2246 query = query.encode('utf-8')
2247 if prefix == '':
2248 self._download_n_results(query, 1)
2249 return
2250 elif prefix == 'all':
2251 self._download_n_results(query, self._max_youtube_results)
2252 return
2253 else:
2254 try:
2255 n = long(prefix)
2256 if n <= 0:
2257 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2258 return
2259 elif n > self._max_youtube_results:
2260 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2261 n = self._max_youtube_results
2262 self._download_n_results(query, n)
2263 return
2264 except ValueError: # parsing prefix as integer fails
2265 self._download_n_results(query, 1)
2266 return
2267
2268 def _download_n_results(self, query, n):
2269 """Downloads a specified number of results for a query"""
2270
2271 video_ids = []
2272 already_seen = set()
2273 pagenum = 1
2274
2275 while True:
2276 self.report_download_page(query, pagenum)
2277 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2278 request = urllib2.Request(result_url)
2279 try:
2280 page = urllib2.urlopen(request).read()
2281 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2282 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2283 return
2284
2285 # Extract video identifiers
2286 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2287 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2288 if video_id not in already_seen:
2289 video_ids.append(video_id)
2290 already_seen.add(video_id)
2291 if len(video_ids) == n:
2292 # Specified n videos reached
2293 for id in video_ids:
2294 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2295 return
2296
2297 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2298 for id in video_ids:
2299 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2300 return
2301
2302 pagenum = pagenum + 1
2303
2304
2305 class GoogleSearchIE(InfoExtractor):
2306 """Information Extractor for Google Video search queries."""
2307 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2308 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2309 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2310 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2311 _google_ie = None
2312 _max_google_results = 1000
2313 IE_NAME = u'video.google:search'
2314
2315 def __init__(self, google_ie, downloader=None):
2316 InfoExtractor.__init__(self, downloader)
2317 self._google_ie = google_ie
2318
2319 def report_download_page(self, query, pagenum):
2320 """Report attempt to download playlist page with given number."""
2321 query = query.decode(preferredencoding())
2322 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2323
2324 def _real_initialize(self):
2325 self._google_ie.initialize()
2326
2327 def _real_extract(self, query):
2328 mobj = re.match(self._VALID_URL, query)
2329 if mobj is None:
2330 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2331 return
2332
2333 prefix, query = query.split(':')
2334 prefix = prefix[8:]
2335 query = query.encode('utf-8')
2336 if prefix == '':
2337 self._download_n_results(query, 1)
2338 return
2339 elif prefix == 'all':
2340 self._download_n_results(query, self._max_google_results)
2341 return
2342 else:
2343 try:
2344 n = long(prefix)
2345 if n <= 0:
2346 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2347 return
2348 elif n > self._max_google_results:
2349 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2350 n = self._max_google_results
2351 self._download_n_results(query, n)
2352 return
2353 except ValueError: # parsing prefix as integer fails
2354 self._download_n_results(query, 1)
2355 return
2356
2357 def _download_n_results(self, query, n):
2358 """Downloads a specified number of results for a query"""
2359
2360 video_ids = []
2361 already_seen = set()
2362 pagenum = 1
2363
2364 while True:
2365 self.report_download_page(query, pagenum)
2366 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2367 request = urllib2.Request(result_url)
2368 try:
2369 page = urllib2.urlopen(request).read()
2370 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2371 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2372 return
2373
2374 # Extract video identifiers
2375 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2376 video_id = mobj.group(1)
2377 if video_id not in already_seen:
2378 video_ids.append(video_id)
2379 already_seen.add(video_id)
2380 if len(video_ids) == n:
2381 # Specified n videos reached
2382 for id in video_ids:
2383 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2384 return
2385
2386 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2387 for id in video_ids:
2388 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2389 return
2390
2391 pagenum = pagenum + 1
2392
2393
2394 class YahooSearchIE(InfoExtractor):
2395 """Information Extractor for Yahoo! Video search queries."""
2396 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2397 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2398 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2399 _MORE_PAGES_INDICATOR = r'\s*Next'
2400 _yahoo_ie = None
2401 _max_yahoo_results = 1000
2402 IE_NAME = u'video.yahoo:search'
2403
2404 def __init__(self, yahoo_ie, downloader=None):
2405 InfoExtractor.__init__(self, downloader)
2406 self._yahoo_ie = yahoo_ie
2407
2408 def report_download_page(self, query, pagenum):
2409 """Report attempt to download playlist page with given number."""
2410 query = query.decode(preferredencoding())
2411 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2412
2413 def _real_initialize(self):
2414 self._yahoo_ie.initialize()
2415
2416 def _real_extract(self, query):
2417 mobj = re.match(self._VALID_URL, query)
2418 if mobj is None:
2419 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2420 return
2421
2422 prefix, query = query.split(':')
2423 prefix = prefix[8:]
2424 query = query.encode('utf-8')
2425 if prefix == '':
2426 self._download_n_results(query, 1)
2427 return
2428 elif prefix == 'all':
2429 self._download_n_results(query, self._max_yahoo_results)
2430 return
2431 else:
2432 try:
2433 n = long(prefix)
2434 if n <= 0:
2435 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2436 return
2437 elif n > self._max_yahoo_results:
2438 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2439 n = self._max_yahoo_results
2440 self._download_n_results(query, n)
2441 return
2442 except ValueError: # parsing prefix as integer fails
2443 self._download_n_results(query, 1)
2444 return
2445
2446 def _download_n_results(self, query, n):
2447 """Downloads a specified number of results for a query"""
2448
2449 video_ids = []
2450 already_seen = set()
2451 pagenum = 1
2452
2453 while True:
2454 self.report_download_page(query, pagenum)
2455 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2456 request = urllib2.Request(result_url)
2457 try:
2458 page = urllib2.urlopen(request).read()
2459 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2460 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2461 return
2462
2463 # Extract video identifiers
2464 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2465 video_id = mobj.group(1)
2466 if video_id not in already_seen:
2467 video_ids.append(video_id)
2468 already_seen.add(video_id)
2469 if len(video_ids) == n:
2470 # Specified n videos reached
2471 for id in video_ids:
2472 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2473 return
2474
2475 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2476 for id in video_ids:
2477 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2478 return
2479
2480 pagenum = pagenum + 1
2481
2482
2483 class YoutubePlaylistIE(InfoExtractor):
2484 """Information Extractor for YouTube playlists."""
2485
2486 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2487 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2488 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2489 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2490 _youtube_ie = None
2491 IE_NAME = u'youtube:playlist'
2492
2493 def __init__(self, youtube_ie, downloader=None):
2494 InfoExtractor.__init__(self, downloader)
2495 self._youtube_ie = youtube_ie
2496
2497 def report_download_page(self, playlist_id, pagenum):
2498 """Report attempt to download playlist page with given number."""
2499 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2500
2501 def _real_initialize(self):
2502 self._youtube_ie.initialize()
2503
2504 def _real_extract(self, url):
2505 # Extract playlist id
2506 mobj = re.match(self._VALID_URL, url)
2507 if mobj is None:
2508 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2509 return
2510
2511 # Single video case
2512 if mobj.group(3) is not None:
2513 self._youtube_ie.extract(mobj.group(3))
2514 return
2515
2516 # Download playlist pages
2517 # prefix is 'p' as default for playlists but there are other types that need extra care
2518 playlist_prefix = mobj.group(1)
2519 if playlist_prefix == 'a':
2520 playlist_access = 'artist'
2521 else:
2522 playlist_prefix = 'p'
2523 playlist_access = 'view_play_list'
2524 playlist_id = mobj.group(2)
2525 video_ids = []
2526 pagenum = 1
2527
2528 while True:
2529 self.report_download_page(playlist_id, pagenum)
2530 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2531 request = urllib2.Request(url)
2532 try:
2533 page = urllib2.urlopen(request).read()
2534 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2535 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2536 return
2537
2538 # Extract video identifiers
2539 ids_in_page = []
2540 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2541 if mobj.group(1) not in ids_in_page:
2542 ids_in_page.append(mobj.group(1))
2543 video_ids.extend(ids_in_page)
2544
2545 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2546 break
2547 pagenum = pagenum + 1
2548
2549 playliststart = self._downloader.params.get('playliststart', 1) - 1
2550 playlistend = self._downloader.params.get('playlistend', -1)
2551 video_ids = video_ids[playliststart:playlistend]
2552
2553 for id in video_ids:
2554 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2555 return
2556
2557
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users."""

	# Matches both full user-page URLs and the 'ytuser:NAME' shorthand;
	# group(1) is the username.
	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# The GData API caps results per request, so uploads are fetched in
	# pages of this size.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	_youtube_ie = None
	IE_NAME = u'youtube:user'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Actual video extraction is delegated to the wrapped YoutubeIE.
		self._youtube_ie = youtube_ie

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		"""Collect all upload ids for the user via the GData feed and
		hand each one to the YouTube extractor, honoring the
		playliststart/playlistend download parameters."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# all of them.

		video_ids = []
		pagenum = 0

		while True:
			# GData start-index is 1-based.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			ids_in_page = []

			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query
			# again.

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break

			pagenum += 1

		all_ids_count = len(video_ids)
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		# playlistend of -1 means 'until the end'; slice open-ended so
		# the last video is not dropped.
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2645
2646 class DepositFilesIE(InfoExtractor):
2647 """Information extractor for depositfiles.com"""
2648
2649 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2650 IE_NAME = u'DepositFiles'
2651
2652 def __init__(self, downloader=None):
2653 InfoExtractor.__init__(self, downloader)
2654
2655 def report_download_webpage(self, file_id):
2656 """Report webpage download."""
2657 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2658
2659 def report_extraction(self, file_id):
2660 """Report information extraction."""
2661 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2662
2663 def _real_extract(self, url):
2664 # At this point we have a new file
2665 self._downloader.increment_downloads()
2666
2667 file_id = url.split('/')[-1]
2668 # Rebuild url in english locale
2669 url = 'http://depositfiles.com/en/files/' + file_id
2670
2671 # Retrieve file webpage with 'Free download' button pressed
2672 free_download_indication = { 'gateway_result' : '1' }
2673 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2674 try:
2675 self.report_download_webpage(file_id)
2676 webpage = urllib2.urlopen(request).read()
2677 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2678 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2679 return
2680
2681 # Search for the real file URL
2682 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2683 if (mobj is None) or (mobj.group(1) is None):
2684 # Try to figure out reason of the error.
2685 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2686 if (mobj is not None) and (mobj.group(1) is not None):
2687 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2688 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2689 else:
2690 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2691 return
2692
2693 file_url = mobj.group(1)
2694 file_extension = os.path.splitext(file_url)[1][1:]
2695
2696 # Search for file title
2697 mobj = re.search(r'<b title="(.*?)">', webpage)
2698 if mobj is None:
2699 self._downloader.trouble(u'ERROR: unable to extract title')
2700 return
2701 file_title = mobj.group(1).decode('utf-8')
2702
2703 try:
2704 # Process file information
2705 self._downloader.process_info({
2706 'id': file_id.decode('utf-8'),
2707 'url': file_url.decode('utf-8'),
2708 'uploader': u'NA',
2709 'upload_date': u'NA',
2710 'title': file_title,
2711 'stitle': file_title,
2712 'ext': file_extension.decode('utf-8'),
2713 'format': u'NA',
2714 'player_url': None,
2715 })
2716 except UnavailableVideoError, err:
2717 self._downloader.trouble(u'ERROR: unable to download file')
2718
2719
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format identifiers; the format-selection code below treats this list as
    # ordered best-first (a format_limit slices from its position to the end).
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: each entry maps a metadata key to the regex that
        # captures its value from the page's embedded JavaScript/HTML.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are URL-quoted and unicode-escaped inside the page;
                # undo both layers. Missing pieces are simply left out.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls: one entry per available format name found in the page.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook (if credentials were supplied) before extraction.

        Credentials come from --username/--password or, failing that, from
        the 'facebook' machine entry in ~/.netrc. Without credentials this
        is a no-op. Login failures only warn; extraction proceeds anyway.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means we are still logged out.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the video page, parse metadata and queue each requested
        format with the downloader."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader (mandatory)
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title (mandatory)
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = _simplify_title(video_title)

        # thumbnail image (optional -- empty string when absent)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date (optional; NOTE(review): _parse_page never produces an
        # 'upload_date' key, so this branch appears to be dead -- confirm)
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz yields a 10-tuple; the first 9 fields form
                    # a struct_time-compatible tuple for strftime.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # Restrict the candidate list to formats at or below the limit
            # (formats are listed best-first in _available_formats).
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:

            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': None,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
2935
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract a blip.tv video.

        Appends skin=json to the URL and requests it. If the response is
        already a media file (Content-Type: video/*), the URL is downloaded
        directly; otherwise the response is parsed as the JSON metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # 'urlhandle' passes the already-open response to the
                # downloader so the file is not requested twice.
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'stitle': _simplify_title(title),
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                # NOTE(review): `json` is not imported in the visible header;
                # it is presumably bound elsewhere in this file (e.g. a
                # json/simplejson fallback) -- confirm.
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # NOTE(review): '%H:%M%p' mixes 24-hour %H with %p; %p is
                # effectively ignored by strptime here -- confirm intended.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'stitle': _simplify_title(data['title']),
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                # Covers malformed JSON, missing keys and the extension error.
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        self._downloader.increment_downloads()

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
3028
3029
3030 class MyVideoIE(InfoExtractor):
3031 """Information Extractor for myvideo.de."""
3032
3033 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3034 IE_NAME = u'myvideo'
3035
3036 def __init__(self, downloader=None):
3037 InfoExtractor.__init__(self, downloader)
3038
3039 def report_download_webpage(self, video_id):
3040 """Report webpage download."""
3041 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3042
3043 def report_extraction(self, video_id):
3044 """Report information extraction."""
3045 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3046
3047 def _real_extract(self,url):
3048 mobj = re.match(self._VALID_URL, url)
3049 if mobj is None:
3050 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3051 return
3052
3053 video_id = mobj.group(1)
3054
3055 # Get video webpage
3056 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3057 try:
3058 self.report_download_webpage(video_id)
3059 webpage = urllib2.urlopen(request).read()
3060 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3061 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3062 return
3063
3064 self.report_extraction(video_id)
3065 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3066 webpage)
3067 if mobj is None:
3068 self._downloader.trouble(u'ERROR: unable to extract media URL')
3069 return
3070 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3071
3072 mobj = re.search('<title>([^<]+)</title>', webpage)
3073 if mobj is None:
3074 self._downloader.trouble(u'ERROR: unable to extract title')
3075 return
3076
3077 video_title = mobj.group(1)
3078 video_title = sanitize_title(video_title)
3079
3080 simple_title = _simplify_title(video_title)
3081
3082 try:
3083 self._downloader.process_info({
3084 'id': video_id,
3085 'url': video_url,
3086 'uploader': u'NA',
3087 'upload_date': u'NA',
3088 'title': video_title,
3089 'stitle': simple_title,
3090 'ext': u'flv',
3091 'format': u'NA',
3092 'player_url': None,
3093 })
3094 except UnavailableVideoError:
3095 self._downloader.trouble(u'\nERROR: Unable to download video')
3096
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts either a ':shortname' alias (e.g. ':tds') or a full-episodes URL.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the Flash player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve a show alias or episode URL, fetch the episode's media
        index, and download each media item it lists."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Translate ':tds'-style aliases into the show's full-episodes URL
        # and re-match so the named groups below are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # An empty episode part means "download the newest episode": the
        # site redirects the bare full-episodes URL to the latest one.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # Follow the redirect to learn which episode we actually got.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Each match is (full flash URL, mgid-style uri component).
        mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        # Resolve the player URL through its redirects; the final URL is
        # passed along to the downloader for rtmpdump.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        # One <item> per media segment of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            # Collect (bitrate, url) pairs for every available rendition.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            # NOTE(review): taking the last element assumes the feed lists
            # renditions in ascending bitrate order -- confirm.
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'stitle': _simplify_title(effTitle),
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
                continue
3231
3232
3233 class EscapistIE(InfoExtractor):
3234 """Information extractor for The Escapist """
3235
3236 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3237 IE_NAME = u'escapist'
3238
3239 def report_extraction(self, showName):
3240 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3241
3242 def report_config_download(self, showName):
3243 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3244
3245 def _real_extract(self, url):
3246 htmlParser = HTMLParser.HTMLParser()
3247
3248 mobj = re.match(self._VALID_URL, url)
3249 if mobj is None:
3250 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3251 return
3252 showName = mobj.group('showname')
3253 videoId = mobj.group('episode')
3254
3255 self.report_extraction(showName)
3256 try:
3257 webPage = urllib2.urlopen(url).read()
3258 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3259 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3260 return
3261
3262 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3263 description = htmlParser.unescape(descMatch.group(1))
3264 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3265 imgUrl = htmlParser.unescape(imgMatch.group(1))
3266 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3267 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3268 configUrlMatch = re.search('config=(.*)$', playerUrl)
3269 configUrl = urllib2.unquote(configUrlMatch.group(1))
3270
3271 self.report_config_download(showName)
3272 try:
3273 configJSON = urllib2.urlopen(configUrl).read()
3274 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3275 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3276 return
3277
3278 # Technically, it's JavaScript, not JSON
3279 configJSON = configJSON.replace("'", '"')
3280
3281 try:
3282 config = json.loads(configJSON)
3283 except (ValueError,), err:
3284 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3285 return
3286
3287 playlist = config['playlist']
3288 videoUrl = playlist[1]['url']
3289
3290 self._downloader.increment_downloads()
3291 info = {
3292 'id': videoId,
3293 'url': videoUrl,
3294 'uploader': showName,
3295 'upload_date': None,
3296 'title': showName,
3297 'stitle': _simplify_title(showName),
3298 'ext': 'flv',
3299 'format': 'flv',
3300 'thumbnail': imgUrl,
3301 'description': description,
3302 'player_url': playerUrl,
3303 }
3304
3305 try:
3306 self._downloader.process_info(info)
3307 except UnavailableVideoError, err:
3308 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3309
3310
3311 class CollegeHumorIE(InfoExtractor):
3312 """Information extractor for collegehumor.com"""
3313
3314 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3315 IE_NAME = u'collegehumor'
3316
3317 def report_webpage(self, video_id):
3318 """Report information extraction."""
3319 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3320
3321 def report_extraction(self, video_id):
3322 """Report information extraction."""
3323 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3324
3325 def _real_extract(self, url):
3326 htmlParser = HTMLParser.HTMLParser()
3327
3328 mobj = re.match(self._VALID_URL, url)
3329 if mobj is None:
3330 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3331 return
3332 video_id = mobj.group('videoid')
3333
3334 self.report_webpage(video_id)
3335 request = urllib2.Request(url)
3336 try:
3337 webpage = urllib2.urlopen(request).read()
3338 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3339 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3340 return
3341
3342 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3343 if m is None:
3344 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3345 return
3346 internal_video_id = m.group('internalvideoid')
3347
3348 info = {
3349 'id': video_id,
3350 'internal_id': internal_video_id,
3351 }
3352
3353 self.report_extraction(video_id)
3354 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3355 try:
3356 metaXml = urllib2.urlopen(xmlUrl).read()
3357 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3358 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3359 return
3360
3361 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3362 try:
3363 videoNode = mdoc.findall('./video')[0]
3364 info['description'] = videoNode.findall('./description')[0].text
3365 info['title'] = videoNode.findall('./caption')[0].text
3366 info['stitle'] = _simplify_title(info['title'])
3367 info['url'] = videoNode.findall('./file')[0].text
3368 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3369 info['ext'] = info['url'].rpartition('.')[2]
3370 info['format'] = info['ext']
3371 except IndexError:
3372 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3373 return
3374
3375 self._downloader.increment_downloads()
3376
3377 try:
3378 self._downloader.process_info(info)
3379 except UnavailableVideoError, err:
3380 self._downloader.trouble(u'\nERROR: unable to download video')
3381
3382
3383 class XVideosIE(InfoExtractor):
3384 """Information extractor for xvideos.com"""
3385
3386 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3387 IE_NAME = u'xvideos'
3388
3389 def report_webpage(self, video_id):
3390 """Report information extraction."""
3391 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3392
3393 def report_extraction(self, video_id):
3394 """Report information extraction."""
3395 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3396
3397 def _real_extract(self, url):
3398 htmlParser = HTMLParser.HTMLParser()
3399
3400 mobj = re.match(self._VALID_URL, url)
3401 if mobj is None:
3402 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3403 return
3404 video_id = mobj.group(1).decode('utf-8')
3405
3406 self.report_webpage(video_id)
3407
3408 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3409 try:
3410 webpage = urllib2.urlopen(request).read()
3411 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3412 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3413 return
3414
3415 self.report_extraction(video_id)
3416
3417
3418 # Extract video URL
3419 mobj = re.search(r'flv_url=(.+?)&', webpage)
3420 if mobj is None:
3421 self._downloader.trouble(u'ERROR: unable to extract video url')
3422 return
3423 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3424
3425
3426 # Extract title
3427 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3428 if mobj is None:
3429 self._downloader.trouble(u'ERROR: unable to extract video title')
3430 return
3431 video_title = mobj.group(1).decode('utf-8')
3432
3433
3434 # Extract video thumbnail
3435 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3436 if mobj is None:
3437 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3438 return
3439 video_thumbnail = mobj.group(1).decode('utf-8')
3440
3441
3442
3443 self._downloader.increment_downloads()
3444 info = {
3445 'id': video_id,
3446 'url': video_url,
3447 'uploader': None,
3448 'upload_date': None,
3449 'title': video_title,
3450 'stitle': _simplify_title(video_title),
3451 'ext': 'flv',
3452 'format': 'flv',
3453 'thumbnail': video_thumbnail,
3454 'description': None,
3455 'player_url': None,
3456 }
3457
3458 try:
3459 self._downloader.process_info(info)
3460 except UnavailableVideoError, err:
3461 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3462
3463
3464 class SoundcloudIE(InfoExtractor):
3465 """Information extractor for soundcloud.com
3466 To access the media, the uid of the song and a stream token
3467 must be extracted from the page source and the script must make
3468 a request to media.soundcloud.com/crossdomain.xml. Then
3469 the media can be grabbed by requesting from an url composed
3470 of the stream token and uid
3471 """
3472
3473 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3474 IE_NAME = u'soundcloud'
3475
3476 def __init__(self, downloader=None):
3477 InfoExtractor.__init__(self, downloader)
3478
3479 def report_webpage(self, video_id):
3480 """Report information extraction."""
3481 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3482
3483 def report_extraction(self, video_id):
3484 """Report information extraction."""
3485 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3486
3487 def _real_extract(self, url):
3488 htmlParser = HTMLParser.HTMLParser()
3489
3490 mobj = re.match(self._VALID_URL, url)
3491 if mobj is None:
3492 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3493 return
3494
3495 # extract uploader (which is in the url)
3496 uploader = mobj.group(1).decode('utf-8')
3497 # extract simple title (uploader + slug of song title)
3498 slug_title = mobj.group(2).decode('utf-8')
3499 simple_title = uploader + '-' + slug_title
3500
3501 self.report_webpage('%s/%s' % (uploader, slug_title))
3502
3503 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3504 try:
3505 webpage = urllib2.urlopen(request).read()
3506 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3507 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3508 return
3509
3510 self.report_extraction('%s/%s' % (uploader, slug_title))
3511
3512 # extract uid and stream token that soundcloud hands out for access
3513 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3514 if mobj:
3515 video_id = mobj.group(1)
3516 stream_token = mobj.group(2)
3517
3518 # extract unsimplified title
3519 mobj = re.search('"title":"(.*?)",', webpage)
3520 if mobj:
3521 title = mobj.group(1)
3522
3523 # construct media url (with uid/token)
3524 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3525 mediaURL = mediaURL % (video_id, stream_token)
3526
3527 # description
3528 description = u'No description available'
3529 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3530 if mobj:
3531 description = mobj.group(1)
3532
3533 # upload date
3534 upload_date = None
3535 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3536 if mobj:
3537 try:
3538 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3539 except Exception, e:
3540 print str(e)
3541
3542 # for soundcloud, a request to a cross domain is required for cookies
3543 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3544
3545 try:
3546 self._downloader.process_info({
3547 'id': video_id.decode('utf-8'),
3548 'url': mediaURL,
3549 'uploader': uploader.decode('utf-8'),
3550 'upload_date': upload_date,
3551 'title': simple_title.decode('utf-8'),
3552 'stitle': simple_title.decode('utf-8'),
3553 'ext': u'mp3',
3554 'format': u'NA',
3555 'player_url': None,
3556 'description': description.decode('utf-8')
3557 })
3558 except UnavailableVideoError:
3559 self._downloader.trouble(u'\nERROR: unable to download video')
3560
3561
3562 class InfoQIE(InfoExtractor):
3563 """Information extractor for infoq.com"""
3564
3565 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3566 IE_NAME = u'infoq'
3567
3568 def report_webpage(self, video_id):
3569 """Report information extraction."""
3570 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3571
3572 def report_extraction(self, video_id):
3573 """Report information extraction."""
3574 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3575
3576 def _real_extract(self, url):
3577 htmlParser = HTMLParser.HTMLParser()
3578
3579 mobj = re.match(self._VALID_URL, url)
3580 if mobj is None:
3581 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3582 return
3583
3584 self.report_webpage(url)
3585
3586 request = urllib2.Request(url)
3587 try:
3588 webpage = urllib2.urlopen(request).read()
3589 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3590 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3591 return
3592
3593 self.report_extraction(url)
3594
3595
3596 # Extract video URL
3597 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3598 if mobj is None:
3599 self._downloader.trouble(u'ERROR: unable to extract video url')
3600 return
3601 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3602
3603
3604 # Extract title
3605 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3606 if mobj is None:
3607 self._downloader.trouble(u'ERROR: unable to extract video title')
3608 return
3609 video_title = mobj.group(1).decode('utf-8')
3610
3611 # Extract description
3612 video_description = u'No description available.'
3613 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3614 if mobj is not None:
3615 video_description = mobj.group(1).decode('utf-8')
3616
3617 video_filename = video_url.split('/')[-1]
3618 video_id, extension = video_filename.split('.')
3619
3620 self._downloader.increment_downloads()
3621 info = {
3622 'id': video_id,
3623 'url': video_url,
3624 'uploader': None,
3625 'upload_date': None,
3626 'title': video_title,
3627 'stitle': _simplify_title(video_title),
3628 'ext': extension,
3629 'format': extension, # Extension is always(?) mp4, but seems to be flv
3630 'thumbnail': None,
3631 'description': video_description,
3632 'player_url': None,
3633 }
3634
3635 try:
3636 self._downloader.process_info(info)
3637 except UnavailableVideoError, err:
3638 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3639
3640 class MixcloudIE(InfoExtractor):
3641 """Information extractor for www.mixcloud.com"""
3642 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3643 IE_NAME = u'mixcloud'
3644
3645 def __init__(self, downloader=None):
3646 InfoExtractor.__init__(self, downloader)
3647
3648 def report_download_json(self, file_id):
3649 """Report JSON download."""
3650 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3651
3652 def report_extraction(self, file_id):
3653 """Report information extraction."""
3654 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3655
3656 def get_urls(self, jsonData, fmt, bitrate='best'):
3657 """Get urls from 'audio_formats' section in json"""
3658 file_url = None
3659 try:
3660 bitrate_list = jsonData[fmt]
3661 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3662 bitrate = max(bitrate_list) # select highest
3663
3664 url_list = jsonData[fmt][bitrate]
3665 except TypeError: # we have no bitrate info.
3666 url_list = jsonData[fmt]
3667
3668 return url_list
3669
3670 def check_urls(self, url_list):
3671 """Returns 1st active url from list"""
3672 for url in url_list:
3673 try:
3674 urllib2.urlopen(url)
3675 return url
3676 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3677 url = None
3678
3679 return None
3680
3681 def _print_formats(self, formats):
3682 print 'Available formats:'
3683 for fmt in formats.keys():
3684 for b in formats[fmt]:
3685 try:
3686 ext = formats[fmt][b][0]
3687 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3688 except TypeError: # we have no bitrate info
3689 ext = formats[fmt][0]
3690 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3691 break
3692
3693 def _real_extract(self, url):
3694 mobj = re.match(self._VALID_URL, url)
3695 if mobj is None:
3696 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3697 return
3698 # extract uploader & filename from url
3699 uploader = mobj.group(1).decode('utf-8')
3700 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3701
3702 # construct API request
3703 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3704 # retrieve .json file with links to files
3705 request = urllib2.Request(file_url)
3706 try:
3707 self.report_download_json(file_url)
3708 jsonData = urllib2.urlopen(request).read()
3709 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3710 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3711 return
3712
3713 # parse JSON
3714 json_data = json.loads(jsonData)
3715 player_url = json_data['player_swf_url']
3716 formats = dict(json_data['audio_formats'])
3717
3718 req_format = self._downloader.params.get('format', None)
3719 bitrate = None
3720
3721 if self._downloader.params.get('listformats', None):
3722 self._print_formats(formats)
3723 return
3724
3725 if req_format is None or req_format == 'best':
3726 for format_param in formats.keys():
3727 url_list = self.get_urls(formats, format_param)
3728 # check urls
3729 file_url = self.check_urls(url_list)
3730 if file_url is not None:
3731 break # got it!
3732 else:
3733 if req_format not in formats.keys():
3734 self._downloader.trouble(u'ERROR: format is not available')
3735 return
3736
3737 url_list = self.get_urls(formats, req_format)
3738 file_url = self.check_urls(url_list)
3739 format_param = req_format
3740
3741 # We have audio
3742 self._downloader.increment_downloads()
3743 try:
3744 # Process file information
3745 self._downloader.process_info({
3746 'id': file_id.decode('utf-8'),
3747 'url': file_url.decode('utf-8'),
3748 'uploader': uploader.decode('utf-8'),
3749 'upload_date': u'NA',
3750 'title': json_data['name'],
3751 'stitle': _simplify_title(json_data['name']),
3752 'ext': file_url.split('.')[-1].decode('utf-8'),
3753 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3754 'thumbnail': json_data['thumbnail_url'],
3755 'description': json_data['description'],
3756 'player_url': player_url.decode('utf-8'),
3757 })
3758 except UnavailableVideoError, err:
3759 self._downloader.trouble(u'ERROR: unable to download file')
3760
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Matches the site root, a course page (?course=...) and a single video
	# page (?course=...&video=...); named groups distinguish the three cases.
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report webpage download."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on URL type: single video, course playlist, or site root.

		Playlist branches recurse through self.extract() on each referenced page.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Video metadata lives in a per-video XML file next to the media.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			# Derive extension/format from the media URL's suffix.
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Title/description are optional; fall back to the course id.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Each linked VideoPage becomes a reference entry extracted below.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
				for vpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Recurse into every linked CoursePage (each hits the branch above).
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
				for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3879
3880
class PostProcessor(object):
	"""Base class for download post-processing steps.

	Instances are registered on a downloader with its add_post_processor()
	method. After each successful download, the downloader feeds the info
	dictionary through every registered PostProcessor's run() method in
	turn, handing each returned dictionary to the next processor. The chain
	stops when a processor returns None or when the last one has run.

	Like InfoExtractor, this class participates in a "mutual registration"
	scheme with its downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the downloader this post processor reports to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		information is a dictionary shaped like the ones InfoExtractors
		build, plus an extra 'filepath' key naming the downloaded file.

		Return a dictionary (possibly the same one, with fields changed)
		to pass along the chain, or None to stop postprocessing. May raise
		a PostProcessingError, which the calling downloader handles.
		"""
		return information # default: pass through unchanged
3926
class AudioConversionError(BaseException):
	"""Raised when an ffmpeg/ffprobe audio conversion step fails.

	NOTE(review): deliberately kept as a BaseException subclass to match the
	original; broad 'except Exception' handlers will not catch it, and the
	post processor below checks for it by isinstance.
	"""

	def __init__(self, message):
		self.message = message
3930
class FFmpegExtractAudioPP(PostProcessor):
	# Post processor that converts a downloaded video into an audio-only file
	# by shelling out to the external ffprobe and ffmpeg binaries.

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		"""Set up the post processor.

		preferredcodec: target codec ('best', 'aac', 'mp3', 'm4a', 'vorbis',
			'wav'); None is treated as 'best'.
		preferredquality: ffmpeg audio bitrate specification (e.g. '128K'), or None.
		keepvideo: if True, keep the source video file after conversion.
		"""
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name ffprobe reports for path, or None on any failure."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			# ffprobe missing or not executable
			return None
		audio_codec = None
		# ffprobe emits key=value lines grouped per stream; remember the last
		# codec_name seen and report it once its stream turns out to be audio.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to convert path into out_path.

		codec: value passed to ffmpeg's -acodec, or None to omit the option.
		more_opts: extra command-line arguments inserted before the output path.
		Raises AudioConversionError if ffmpeg is missing or exits non-zero.
		"""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		cmd = ['ffmpeg', '-y', '-i', path, '-vn'] + acodec_opts + more_opts + ['--', out_path]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			if isinstance(e, OSError) and e.errno == 2:
				# errno 2 == ENOENT: the ffmpeg binary itself was not found
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			# Surface ffmpeg's last stderr line as the error message.
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		"""PostProcessor entry point: convert the file at information['filepath'].

		Returns the updated information dict, or None to stop the chain.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		# First branch: the source codec already matches the request (or
		# 'best' was asked for), so prefer a lossless stream copy.
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		# Output path: same name as the source, audio extension swapped in.
		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			# NOTE(review): bare except also traps KeyboardInterrupt/SystemExit here.
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4055
4056
4057 def updateSelf(downloader, filename):
4058 ''' Update the program file with the latest version from the repository '''
4059 # Note: downloader only used for options
4060 if not os.access(filename, os.W_OK):
4061 sys.exit('ERROR: no write permissions on %s' % filename)
4062
4063 downloader.to_screen('Updating to latest version...')
4064
4065 try:
4066 try:
4067 urlh = urllib.urlopen(UPDATE_URL)
4068 newcontent = urlh.read()
4069
4070 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4071 if vmatch is not None and vmatch.group(1) == __version__:
4072 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
4073 return
4074 finally:
4075 urlh.close()
4076 except (IOError, OSError), err:
4077 sys.exit('ERROR: unable to download latest version')
4078
4079 try:
4080 outf = open(filename, 'wb')
4081 try:
4082 outf.write(newcontent)
4083 finally:
4084 outf.close()
4085 except (IOError, OSError), err:
4086 sys.exit('ERROR: unable to overwrite current version')
4087
4088 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
4089
def parseOpts():
	"""Build the optparse parser, merge config-file options with sys.argv,
	and return (parser, opts, args)."""
	# Deferred imports
	import getpass
	import optparse
	import shlex

	def _readOptions(filename):
		# Read extra command-line arguments from a config file, one or more
		# per line, honoring shell-style quoting and # comments.
		try:
			optionf = open(filename)
		except IOError:
			return [] # silently skip if file is not present
		try:
			res = []
			for l in optionf:
				res += shlex.split(l, comments=True)
		finally:
			optionf.close()
		return res

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Terminal width: $COLUMNS if set, else ask stty, else None.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version' : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general = optparse.OptionGroup(parser, 'General Options')
	selection = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format = optparse.OptionGroup(parser, 'Video Format Options')
	postproc = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--prefer-free-formats',
			action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	# Options come from /etc config, then the per-user config, then argv,
	# so later sources override earlier ones.
	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	if xdg_config_home:
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	else:
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
4299
def gen_extractors():
	"""Build the ordered list with one instance of every supported extractor.

	Ordering matters: the first extractor whose suitable() accepts a URL is
	the one that handles it, so the catch-all GenericIE is appended last.
	"""
	# A few extractors are shared with their search/playlist wrappers.
	youtube = YoutubeIE()
	google = GoogleIE()
	yahoo = YahooIE()
	extractors = [
		YoutubePlaylistIE(youtube),
		YoutubeUserIE(youtube),
		YoutubeSearchIE(youtube),
		youtube,
		MetacafeIE(youtube),
		DailymotionIE(),
		google,
		GoogleSearchIE(google),
		PhotobucketIE(),
		yahoo,
		YahooSearchIE(yahoo),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
	]
	extractors.append(GenericIE())
	return extractors
4335
4336 def _real_main():
4337 parser, opts, args = parseOpts()
4338
4339 # Open appropriate CookieJar
4340 if opts.cookiefile is None:
4341 jar = cookielib.CookieJar()
4342 else:
4343 try:
4344 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4345 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4346 jar.load()
4347 except (IOError, OSError), err:
4348 sys.exit(u'ERROR: unable to open cookie file')
4349
4350 # Dump user agent
4351 if opts.dump_user_agent:
4352 print std_headers['User-Agent']
4353 sys.exit(0)
4354
4355 # Batch file verification
4356 batchurls = []
4357 if opts.batchfile is not None:
4358 try:
4359 if opts.batchfile == '-':
4360 batchfd = sys.stdin
4361 else:
4362 batchfd = open(opts.batchfile, 'r')
4363 batchurls = batchfd.readlines()
4364 batchurls = [x.strip() for x in batchurls]
4365 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4366 except IOError:
4367 sys.exit(u'ERROR: batch file could not be read')
4368 all_urls = batchurls + args
4369
4370 # General configuration
4371 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4372 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4373 urllib2.install_opener(opener)
4374 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4375
4376 extractors = gen_extractors()
4377
4378 if opts.list_extractors:
4379 for ie in extractors:
4380 print(ie.IE_NAME)
4381 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4382 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4383 for mu in matchedUrls:
4384 print(u' ' + mu)
4385 sys.exit(0)
4386
4387 # Conflicting, missing and erroneous options
4388 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4389 parser.error(u'using .netrc conflicts with giving username/password')
4390 if opts.password is not None and opts.username is None:
4391 parser.error(u'account username missing')
4392 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4393 parser.error(u'using output template conflicts with using title, literal title or auto number')
4394 if opts.usetitle and opts.useliteral:
4395 parser.error(u'using title conflicts with using literal title')
4396 if opts.username is not None and opts.password is None:
4397 opts.password = getpass.getpass(u'Type account password and press return:')
4398 if opts.ratelimit is not None:
4399 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4400 if numeric_limit is None:
4401 parser.error(u'invalid rate limit specified')
4402 opts.ratelimit = numeric_limit
4403 if opts.retries is not None:
4404 try:
4405 opts.retries = long(opts.retries)
4406 except (TypeError, ValueError), err:
4407 parser.error(u'invalid retry count specified')
4408 try:
4409 opts.playliststart = int(opts.playliststart)
4410 if opts.playliststart <= 0:
4411 raise ValueError(u'Playlist start must be positive')
4412 except (TypeError, ValueError), err:
4413 parser.error(u'invalid playlist start number specified')
4414 try:
4415 opts.playlistend = int(opts.playlistend)
4416 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4417 raise ValueError(u'Playlist end must be greater than playlist start')
4418 except (TypeError, ValueError), err:
4419 parser.error(u'invalid playlist end number specified')
4420 if opts.extractaudio:
4421 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4422 parser.error(u'invalid audio format specified')
4423
4424 # File downloader
4425 fd = FileDownloader({
4426 'usenetrc': opts.usenetrc,
4427 'username': opts.username,
4428 'password': opts.password,
4429 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4430 'forceurl': opts.geturl,
4431 'forcetitle': opts.gettitle,
4432 'forcethumbnail': opts.getthumbnail,
4433 'forcedescription': opts.getdescription,
4434 'forcefilename': opts.getfilename,
4435 'forceformat': opts.getformat,
4436 'simulate': opts.simulate,
4437 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4438 'format': opts.format,
4439 'format_limit': opts.format_limit,
4440 'listformats': opts.listformats,
4441 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4442 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4443 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4444 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4445 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4446 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4447 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4448 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4449 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4450 or u'%(id)s.%(ext)s'),
4451 'ignoreerrors': opts.ignoreerrors,
4452 'ratelimit': opts.ratelimit,
4453 'nooverwrites': opts.nooverwrites,
4454 'retries': opts.retries,
4455 'continuedl': opts.continue_dl,
4456 'noprogress': opts.noprogress,
4457 'playliststart': opts.playliststart,
4458 'playlistend': opts.playlistend,
4459 'logtostderr': opts.outtmpl == '-',
4460 'consoletitle': opts.consoletitle,
4461 'nopart': opts.nopart,
4462 'updatetime': opts.updatetime,
4463 'writedescription': opts.writedescription,
4464 'writeinfojson': opts.writeinfojson,
4465 'matchtitle': opts.matchtitle,
4466 'rejecttitle': opts.rejecttitle,
4467 'max_downloads': opts.max_downloads,
4468 'prefer_free_formats': opts.prefer_free_formats,
4469 })
4470 for extractor in extractors:
4471 fd.add_info_extractor(extractor)
4472
4473 # PostProcessors
4474 if opts.extractaudio:
4475 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4476
4477 # Update version
4478 if opts.update_self:
4479 updateSelf(fd, sys.argv[0])
4480
4481 # Maybe do nothing
4482 if len(all_urls) < 1:
4483 if not opts.update_self:
4484 parser.error(u'you must provide at least one URL')
4485 else:
4486 sys.exit()
4487
4488 try:
4489 retcode = fd.download(all_urls)
4490 except MaxDownloadsReached:
4491 fd.to_screen(u'--max-download limit reached, aborting.')
4492 retcode = 101
4493
4494 # Dump cookie jar if requested
4495 if opts.cookiefile is not None:
4496 try:
4497 jar.save()
4498 except (IOError, OSError), err:
4499 sys.exit(u'ERROR: unable to save cookie jar')
4500
4501 sys.exit(retcode)
4502
def main():
	"""Program entry point.

	Delegates to _real_main() and maps the known failure modes onto
	process exit statuses / error messages.
	"""
	try:
		_real_main()
	except DownloadError:
		status = 1
	except SameFileError:
		status = u'ERROR: fixed output name but more than one file to download'
	except KeyboardInterrupt:
		status = u'\nERROR: Interrupted by user'
	else:
		return
	# sys.exit with a string prints it to stderr and exits with code 1.
	sys.exit(status)
4512
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
	main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: