youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import io
   6 import locale
   7 import os
   8 import re
   9 import sys
  10 import zlib
  11 import email.utils
  12 import json
  13
  14 try:
  15     import urllib.request as compat_urllib_request
  16 except ImportError: # Python 2
  17     import urllib2 as compat_urllib_request
  18
  19 try:
  20     import urllib.error as compat_urllib_error
  21 except ImportError: # Python 2
  22     import urllib2 as compat_urllib_error
  23
  24 try:
  25     import urllib.parse as compat_urllib_parse
  26 except ImportError: # Python 2
  27     import urllib as compat_urllib_parse
  28
  29 try:
  30     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  31 except ImportError: # Python 2
  32     from urlparse import urlparse as compat_urllib_parse_urlparse
  33
  34 try:
  35     import http.cookiejar as compat_cookiejar
  36 except ImportError: # Python 2
  37     import cookielib as compat_cookiejar
  38
  39 try:
  40     import html.entities as compat_html_entities
  41 except ImportError: # Python 2
  42     import htmlentitydefs as compat_html_entities
  43
  44 try:
  45     import html.parser as compat_html_parser
  46 except ImportError: # Python 2
  47     import HTMLParser as compat_html_parser
  48
  49 try:
  50     import http.client as compat_http_client
  51 except ImportError: # Python 2
  52     import httplib as compat_http_client
  53
  54 try:
  55     from urllib.parse import parse_qs as compat_parse_qs
  56 except ImportError: # Python 2
  57     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  58     # Python 2's version is apparently totally broken
  59     def _unquote(string, encoding='utf-8', errors='replace'):
  60         if string == '':
  61             return string
  62         res = string.split('%')
  63         if len(res) == 1:
  64             return string
  65         if encoding is None:
  66             encoding = 'utf-8'
  67         if errors is None:
  68             errors = 'replace'
  69         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
  70         pct_sequence = b''
  71         string = res[0]
  72         for item in res[1:]:
  73             try:
  74                 if not item:
  75                     raise ValueError
  76                 pct_sequence += item[:2].decode('hex')
  77                 rest = item[2:]
  78                 if not rest:
  79                     # This segment was just a single percent-encoded character.
  80                     # May be part of a sequence of code units, so delay decoding.
  81                     # (Stored in pct_sequence).
  82                     continue
  83             except ValueError:
  84                 rest = '%' + item
  85             # Encountered non-percent-encoded characters. Flush the current
  86             # pct_sequence.
  87             string += pct_sequence.decode(encoding, errors) + rest
  88             pct_sequence = b''
  89         if pct_sequence:
  90             # Flush the final pct_sequence
  91             string += pct_sequence.decode(encoding, errors)
  92         return string
  93
  94     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
  95                 encoding='utf-8', errors='replace'):
  96         qs, _coerce_result = qs, unicode
  97         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
  98         r = []
  99         for name_value in pairs:
 100             if not name_value and not strict_parsing:
 101                 continue
 102             nv = name_value.split('=', 1)
 103             if len(nv) != 2:
 104                 if strict_parsing:
 105                     raise ValueError("bad query field: %r" % (name_value,))
 106                 # Handle case of a control-name with no equal sign
 107                 if keep_blank_values:
 108                     nv.append('')
 109                 else:
 110                     continue
 111             if len(nv[1]) or keep_blank_values:
 112                 name = nv[0].replace('+', ' ')
 113                 name = _unquote(name, encoding=encoding, errors=errors)
 114                 name = _coerce_result(name)
 115                 value = nv[1].replace('+', ' ')
 116                 value = _unquote(value, encoding=encoding, errors=errors)
 117                 value = _coerce_result(value)
 118                 r.append((name, value))
 119         return r
 120
 121     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 122                 encoding='utf-8', errors='replace'):
 123         parsed_result = {}
 124         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 125                         encoding=encoding, errors=errors)
 126         for name, value in pairs:
 127             if name in parsed_result:
 128                 parsed_result[name].append(value)
 129             else:
 130                 parsed_result[name] = [value]
 131         return parsed_result
 132
# Text string type: the ``unicode`` builtin where it exists (Python 2),
# otherwise ``str``.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: the ``unichr`` builtin where it
# exists (Python 2), otherwise ``chr``.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
 142
# Default HTTP request headers. YoutubeDLHandler.http_request sets each of
# these on every request it processes, replacing any value already present.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 150 def preferredencoding():
 151     """Get preferred encoding.
 152
 153     Returns the best encoding scheme for the system, based on
 154     locale.getpreferredencoding() and some further tweaks.
 155     """
 156     try:
 157         pref = locale.getpreferredencoding()
 158         u'TEST'.encode(pref)
 159     except:
 160         pref = 'UTF-8'
 161
 162     return pref
 163
 164 if sys.version_info < (3,0):
 165     def compat_print(s):
 166         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 167 else:
 168     def compat_print(s):
 169         assert type(s) == type(u'')
 170         print(s)
 171
 172 def htmlentity_transform(matchobj):
 173     """Transforms an HTML entity to a character.
 174
 175     This function receives a match object and is intended to be used with
 176     the re.sub() function.
 177     """
 178     entity = matchobj.group(1)
 179
 180     # Known non-numeric HTML entity
 181     if entity in compat_html_entities.name2codepoint:
 182         return compat_chr(compat_html_entities.name2codepoint[entity])
 183
 184     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 185     if mobj is not None:
 186         numstr = mobj.group(1)
 187         if numstr.startswith(u'x'):
 188             base = 16
 189             numstr = u'0%s' % numstr
 190         else:
 191             base = 10
 192         return compat_chr(int(numstr, base))
 193
 194     # Unknown entity in name, return its literal representation
 195     return (u'&%s;' % entity)
 196
# Overwrite the module-level ``locatestarttagend`` pattern of the HTML parser
# module with the start-tag regex below.
# NOTE(review): presumably this is the pattern from a later stdlib release,
# installed so older interpreters tolerate sloppy attributes - confirm which
# upstream fix is being backported.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 198 class IDParser(compat_html_parser.HTMLParser):
 199     """Modified HTMLParser that isolates a tag with the specified id"""
 200     def __init__(self, id):
 201         self.id = id
 202         self.result = None
 203         self.started = False
 204         self.depth = {}
 205         self.html = None
 206         self.watch_startpos = False
 207         self.error_count = 0
 208         compat_html_parser.HTMLParser.__init__(self)
 209
 210     def error(self, message):
 211         if self.error_count > 10 or self.started:
 212             raise compat_html_parser.HTMLParseError(message, self.getpos())
 213         self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 214         self.error_count += 1
 215         self.goahead(1)
 216
 217     def loads(self, html):
 218         self.html = html
 219         self.feed(html)
 220         self.close()
 221
 222     def handle_starttag(self, tag, attrs):
 223         attrs = dict(attrs)
 224         if self.started:
 225             self.find_startpos(None)
 226         if 'id' in attrs and attrs['id'] == self.id:
 227             self.result = [tag]
 228             self.started = True
 229             self.watch_startpos = True
 230         if self.started:
 231             if not tag in self.depth: self.depth[tag] = 0
 232             self.depth[tag] += 1
 233
 234     def handle_endtag(self, tag):
 235         if self.started:
 236             if tag in self.depth: self.depth[tag] -= 1
 237             if self.depth[self.result[0]] == 0:
 238                 self.started = False
 239                 self.result.append(self.getpos())
 240
 241     def find_startpos(self, x):
 242         """Needed to put the start position of the result (self.result[1])
 243         after the opening tag with the requested id"""
 244         if self.watch_startpos:
 245             self.watch_startpos = False
 246             self.result.append(self.getpos())
 247     handle_entityref = handle_charref = handle_data = handle_comment = \
 248     handle_decl = handle_pi = unknown_decl = find_startpos
 249
 250     def get_result(self):
 251         if self.result is None:
 252             return None
 253         if len(self.result) != 3:
 254             return None
 255         lines = self.html.split('\n')
 256         lines = lines[self.result[1][0]-1:self.result[2][0]]
 257         lines[0] = lines[0][self.result[1][1]:]
 258         if len(lines) == 1:
 259             lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 260         lines[-1] = lines[-1][:self.result[2][1]]
 261         return '\n'.join(lines).strip()
 262
 263 def get_element_by_id(id, html):
 264     """Return the content of the tag with the specified id in the passed HTML document"""
 265     parser = IDParser(id)
 266     try:
 267         parser.loads(html)
 268     except compat_html_parser.HTMLParseError:
 269         pass
 270     return parser.get_result()
 271
 272
 273 def clean_html(html):
 274     """Clean an HTML snippet into a readable string"""
 275     # Newline vs <br />
 276     html = html.replace('\n', ' ')
 277     html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 278     # Strip html tags
 279     html = re.sub('<.*?>', '', html)
 280     # Replace html entities
 281     html = unescapeHTML(html)
 282     return html
 283
 284
 285 def sanitize_open(filename, open_mode):
 286     """Try to open the given filename, and slightly tweak it if this fails.
 287
 288     Attempts to open the given filename. If this fails, it tries to change
 289     the filename slightly, step by step, until it's either able to open it
 290     or it fails and raises a final exception, like the standard open()
 291     function.
 292
 293     It returns the tuple (stream, definitive_file_name).
 294     """
 295     try:
 296         if filename == u'-':
 297             if sys.platform == 'win32':
 298                 import msvcrt
 299                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 300             return (sys.stdout, filename)
 301         stream = open(encodeFilename(filename), open_mode)
 302         return (stream, filename)
 303     except (IOError, OSError) as err:
 304         # In case of error, try to remove win32 forbidden chars
 305         filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
 306
 307         # An exception here should be caught in the caller
 308         stream = open(encodeFilename(filename), open_mode)
 309         return (stream, filename)
 310
 311
 312 def timeconvert(timestr):
 313     """Convert RFC 2822 defined time string into system timestamp"""
 314     timestamp = None
 315     timetuple = email.utils.parsedate_tz(timestr)
 316     if timetuple is not None:
 317         timestamp = email.utils.mktime_tz(timetuple)
 318     return timestamp
 319
 320 def sanitize_filename(s, restricted=False):
 321     """Sanitizes a string so it could be used as part of a filename.
 322     If restricted is set, use a stricter subset of allowed characters.
 323     """
 324     def replace_insane(char):
 325         if char == '?' or ord(char) < 32 or ord(char) == 127:
 326             return ''
 327         elif char == '"':
 328             return '' if restricted else '\''
 329         elif char == ':':
 330             return '_-' if restricted else ' -'
 331         elif char in '\\/|*<>':
 332             return '_'
 333         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 334             return '_'
 335         if restricted and ord(char) > 127:
 336             return '_'
 337         return char
 338
 339     result = u''.join(map(replace_insane, s))
 340     while '__' in result:
 341         result = result.replace('__', '_')
 342     result = result.strip('_')
 343     # Common case of "Foreign band name - English song title"
 344     if restricted and result.startswith('-_'):
 345         result = result[2:]
 346     if not result:
 347         result = '_'
 348     return result
 349
 350 def orderedSet(iterable):
 351     """ Remove all duplicates from the input iterable """
 352     res = []
 353     for el in iterable:
 354         if el not in res:
 355             res.append(el)
 356     return res
 357
 358 def unescapeHTML(s):
 359     """
 360     @param s a string
 361     """
 362     assert type(s) == type(u'')
 363
 364     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 365     return result
 366
 367 def encodeFilename(s):
 368     """
 369     @param s The name of the file
 370     """
 371
 372     assert type(s) == type(u'')
 373
 374     # Python 3 has a Unicode API
 375     if sys.version_info >= (3, 0):
 376         return s
 377
 378     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 379         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 380         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 381         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 382         return s
 383     else:
 384         return s.encode(sys.getfilesystemencoding(), 'ignore')
 385
 386 class DownloadError(Exception):
 387     """Download Error exception.
 388
 389     This exception may be thrown by FileDownloader objects if they are not
 390     configured to continue on errors. They will contain the appropriate
 391     error message.
 392     """
 393     pass
 394
 395
 396 class SameFileError(Exception):
 397     """Same File exception.
 398
 399     This exception will be thrown by FileDownloader objects if they detect
 400     multiple files would have to be downloaded to the same file on disk.
 401     """
 402     pass
 403
 404
 405 class PostProcessingError(Exception):
 406     """Post Processing exception.
 407
 408     This exception may be raised by PostProcessor's .run() method to
 409     indicate an error in the postprocessing task.
 410     """
 411     pass
 412
 413 class MaxDownloadsReached(Exception):
 414     """ --max-downloads limit has been reached. """
 415     pass
 416
 417
 418 class UnavailableVideoError(Exception):
 419     """Unavailable Format exception.
 420
 421     This exception will be thrown when a video is requested
 422     in a format that is not available for that video.
 423     """
 424     pass
 425
 426
 427 class ContentTooShortError(Exception):
 428     """Content Too Short exception.
 429
 430     This exception may be raised by FileDownloader objects when a file they
 431     download is too small for what the server announced first, indicating
 432     the connection was probably interrupted.
 433     """
 434     # Both in bytes
 435     downloaded = None
 436     expected = None
 437
 438     def __init__(self, downloaded, expected):
 439         self.downloaded = downloaded
 440         self.expected = expected
 441
 442
 443 class Trouble(Exception):
 444     """Trouble helper exception
 445
 446     This is an exception to be handled with
 447     FileDownloader.trouble
 448     """
 449
 450 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 451     """Handler for HTTP requests and responses.
 452
 453     This class, when installed with an OpenerDirector, automatically adds
 454     the standard headers to every HTTP request and handles gzipped and
 455     deflated responses from web servers. If compression is to be avoided in
 456     a particular request, the original request in the program code only has
 457     to include the HTTP header "Youtubedl-No-Compression", which will be
 458     removed before making the real request.
 459
 460     Part of this code was copied from:
 461
 462     http://techknack.net/python-urllib2-handlers/
 463
 464     Andrew Rowls, the author of that code, agreed to release it to the
 465     public domain.
 466     """
 467
 468     @staticmethod
 469     def deflate(data):
 470         try:
 471             return zlib.decompress(data, -zlib.MAX_WBITS)
 472         except zlib.error:
 473             return zlib.decompress(data)
 474
 475     @staticmethod
 476     def addinfourl_wrapper(stream, headers, url, code):
 477         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 478             return compat_urllib_request.addinfourl(stream, headers, url, code)
 479         ret = compat_urllib_request.addinfourl(stream, headers, url)
 480         ret.code = code
 481         return ret
 482
 483     def http_request(self, req):
 484         for h in std_headers:
 485             if h in req.headers:
 486                 del req.headers[h]
 487             req.add_header(h, std_headers[h])
 488         if 'Youtubedl-no-compression' in req.headers:
 489             if 'Accept-encoding' in req.headers:
 490                 del req.headers['Accept-encoding']
 491             del req.headers['Youtubedl-no-compression']
 492         return req
 493
 494     def http_response(self, req, resp):
 495         old_resp = resp
 496         # gzip
 497         if resp.headers.get('Content-encoding', '') == 'gzip':
 498             gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
 499             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 500             resp.msg = old_resp.msg
 501         # deflate
 502         if resp.headers.get('Content-encoding', '') == 'deflate':
 503             gz = io.BytesIO(self.deflate(resp.read()))
 504             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 505             resp.msg = old_resp.msg
 506         return resp