youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import io
   6 import json
   7 import locale
   8 import os
   9 import re
  10 import sys
  11 import traceback
  12 import zlib
  13 import email.utils
  14 import json
  15
  16 try:
  17     import urllib.request as compat_urllib_request
  18 except ImportError: # Python 2
  19     import urllib2 as compat_urllib_request
  20
  21 try:
  22     import urllib.error as compat_urllib_error
  23 except ImportError: # Python 2
  24     import urllib2 as compat_urllib_error
  25
  26 try:
  27     import urllib.parse as compat_urllib_parse
  28 except ImportError: # Python 2
  29     import urllib as compat_urllib_parse
  30
  31 try:
  32     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  33 except ImportError: # Python 2
  34     from urlparse import urlparse as compat_urllib_parse_urlparse
  35
  36 try:
  37     import http.cookiejar as compat_cookiejar
  38 except ImportError: # Python 2
  39     import cookielib as compat_cookiejar
  40
  41 try:
  42     import html.entities as compat_html_entities
  43 except ImportError: # Python 2
  44     import htmlentitydefs as compat_html_entities
  45
  46 try:
  47     import html.parser as compat_html_parser
  48 except ImportError: # Python 2
  49     import HTMLParser as compat_html_parser
  50
  51 try:
  52     import http.client as compat_http_client
  53 except ImportError: # Python 2
  54     import httplib as compat_http_client
  55
  56 try:
  57     from subprocess import DEVNULL
  58     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  59 except ImportError:
  60     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  61
  62 try:
  63     from urllib.parse import parse_qs as compat_parse_qs
  64 except ImportError: # Python 2
  65     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  66     # Python 2's version is apparently totally broken
  67     def _unquote(string, encoding='utf-8', errors='replace'):
  68         if string == '':
  69             return string
  70         res = string.split('%')
  71         if len(res) == 1:
  72             return string
  73         if encoding is None:
  74             encoding = 'utf-8'
  75         if errors is None:
  76             errors = 'replace'
  77         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
  78         pct_sequence = b''
  79         string = res[0]
  80         for item in res[1:]:
  81             try:
  82                 if not item:
  83                     raise ValueError
  84                 pct_sequence += item[:2].decode('hex')
  85                 rest = item[2:]
  86                 if not rest:
  87                     # This segment was just a single percent-encoded character.
  88                     # May be part of a sequence of code units, so delay decoding.
  89                     # (Stored in pct_sequence).
  90                     continue
  91             except ValueError:
  92                 rest = '%' + item
  93             # Encountered non-percent-encoded characters. Flush the current
  94             # pct_sequence.
  95             string += pct_sequence.decode(encoding, errors) + rest
  96             pct_sequence = b''
  97         if pct_sequence:
  98             # Flush the final pct_sequence
  99             string += pct_sequence.decode(encoding, errors)
 100         return string
 101
 102     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 103                 encoding='utf-8', errors='replace'):
 104         qs, _coerce_result = qs, unicode
 105         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 106         r = []
 107         for name_value in pairs:
 108             if not name_value and not strict_parsing:
 109                 continue
 110             nv = name_value.split('=', 1)
 111             if len(nv) != 2:
 112                 if strict_parsing:
 113                     raise ValueError("bad query field: %r" % (name_value,))
 114                 # Handle case of a control-name with no equal sign
 115                 if keep_blank_values:
 116                     nv.append('')
 117                 else:
 118                     continue
 119             if len(nv[1]) or keep_blank_values:
 120                 name = nv[0].replace('+', ' ')
 121                 name = _unquote(name, encoding=encoding, errors=errors)
 122                 name = _coerce_result(name)
 123                 value = nv[1].replace('+', ' ')
 124                 value = _unquote(value, encoding=encoding, errors=errors)
 125                 value = _coerce_result(value)
 126                 r.append((name, value))
 127         return r
 128
 129     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 130                 encoding='utf-8', errors='replace'):
 131         parsed_result = {}
 132         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 133                         encoding=encoding, errors=errors)
 134         for name, value in pairs:
 135             if name in parsed_result:
 136                 parsed_result[name].append(value)
 137             else:
 138                 parsed_result[name] = [value]
 139         return parsed_result
 140
 141 try:
 142     compat_str = unicode # Python 2
 143 except NameError:
 144     compat_str = str
 145
 146 try:
 147     compat_chr = unichr # Python 2
 148 except NameError:
 149     compat_chr = chr
 150
# Default HTTP headers. YoutubeDLHandler.http_request() adds every entry of
# this dict to each outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # gzip and deflate bodies are decoded by YoutubeDLHandler.http_response()
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 158
 159 def preferredencoding():
 160     """Get preferred encoding.
 161
 162     Returns the best encoding scheme for the system, based on
 163     locale.getpreferredencoding() and some further tweaks.
 164     """
 165     try:
 166         pref = locale.getpreferredencoding()
 167         u'TEST'.encode(pref)
 168     except:
 169         pref = 'UTF-8'
 170
 171     return pref
 172
 173 if sys.version_info < (3,0):
 174     def compat_print(s):
 175         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 176 else:
 177     def compat_print(s):
 178         assert type(s) == type(u'')
 179         print(s)
 180
 181 # In Python 2.x, json.dump expects a bytestream.
 182 # In Python 3.x, it writes to a character stream
 183 if sys.version_info < (3,0):
 184     def write_json_file(obj, fn):
 185         with open(fn, 'wb') as f:
 186             json.dump(obj, f)
 187 else:
 188     def write_json_file(obj, fn):
 189         with open(fn, 'w', encoding='utf-8') as f:
 190             json.dump(obj, f)
 191
 192 def htmlentity_transform(matchobj):
 193     """Transforms an HTML entity to a character.
 194
 195     This function receives a match object and is intended to be used with
 196     the re.sub() function.
 197     """
 198     entity = matchobj.group(1)
 199
 200     # Known non-numeric HTML entity
 201     if entity in compat_html_entities.name2codepoint:
 202         return compat_chr(compat_html_entities.name2codepoint[entity])
 203
 204     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 205     if mobj is not None:
 206         numstr = mobj.group(1)
 207         if numstr.startswith(u'x'):
 208             base = 16
 209             numstr = u'0%s' % numstr
 210         else:
 211             base = 10
 212         return compat_chr(int(numstr, base))
 213
 214     # Unknown entity in name, return its literal representation
 215     return (u'&%s;' % entity)
 216
# Replace the start-tag locating regex of the standard library HTML parser
# module with the pattern below; this module-level patch affects every parser
# created from compat_html_parser afterwards.
# NOTE(review): presumably copied from a newer CPython release to get its
# attribute parsing fix on older interpreters - confirm against upstream.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 218 class AttrParser(compat_html_parser.HTMLParser):
 219     """Modified HTMLParser that isolates a tag with the specified attribute"""
 220     def __init__(self, attribute, value):
 221         self.attribute = attribute
 222         self.value = value
 223         self.result = None
 224         self.started = False
 225         self.depth = {}
 226         self.html = None
 227         self.watch_startpos = False
 228         self.error_count = 0
 229         compat_html_parser.HTMLParser.__init__(self)
 230
 231     def error(self, message):
 232         if self.error_count > 10 or self.started:
 233             raise compat_html_parser.HTMLParseError(message, self.getpos())
 234         self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 235         self.error_count += 1
 236         self.goahead(1)
 237
 238     def loads(self, html):
 239         self.html = html
 240         self.feed(html)
 241         self.close()
 242
 243     def handle_starttag(self, tag, attrs):
 244         attrs = dict(attrs)
 245         if self.started:
 246             self.find_startpos(None)
 247         if self.attribute in attrs and attrs[self.attribute] == self.value:
 248             self.result = [tag]
 249             self.started = True
 250             self.watch_startpos = True
 251         if self.started:
 252             if not tag in self.depth: self.depth[tag] = 0
 253             self.depth[tag] += 1
 254
 255     def handle_endtag(self, tag):
 256         if self.started:
 257             if tag in self.depth: self.depth[tag] -= 1
 258             if self.depth[self.result[0]] == 0:
 259                 self.started = False
 260                 self.result.append(self.getpos())
 261
 262     def find_startpos(self, x):
 263         """Needed to put the start position of the result (self.result[1])
 264         after the opening tag with the requested id"""
 265         if self.watch_startpos:
 266             self.watch_startpos = False
 267             self.result.append(self.getpos())
 268     handle_entityref = handle_charref = handle_data = handle_comment = \
 269     handle_decl = handle_pi = unknown_decl = find_startpos
 270
 271     def get_result(self):
 272         if self.result is None:
 273             return None
 274         if len(self.result) != 3:
 275             return None
 276         lines = self.html.split('\n')
 277         lines = lines[self.result[1][0]-1:self.result[2][0]]
 278         lines[0] = lines[0][self.result[1][1]:]
 279         if len(lines) == 1:
 280             lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 281         lines[-1] = lines[-1][:self.result[2][1]]
 282         return '\n'.join(lines).strip()
 283 # Hack for https://github.com/rg3/youtube-dl/issues/662
 284 if sys.version_info < (2, 7, 3):
 285     AttrParser.parse_endtag = (lambda self, i:
 286         i + len("</scr'+'ipt>")
 287         if self.rawdata[i:].startswith("</scr'+'ipt>")
 288         else compat_html_parser.HTMLParser.parse_endtag(self, i))
 289
 290 def get_element_by_id(id, html):
 291     """Return the content of the tag with the specified ID in the passed HTML document"""
 292     return get_element_by_attribute("id", id, html)
 293
 294 def get_element_by_attribute(attribute, value, html):
 295     """Return the content of the tag with the specified attribute in the passed HTML document"""
 296     parser = AttrParser(attribute, value)
 297     try:
 298         parser.loads(html)
 299     except compat_html_parser.HTMLParseError:
 300         pass
 301     return parser.get_result()
 302
 303
 304 def clean_html(html):
 305     """Clean an HTML snippet into a readable string"""
 306     # Newline vs <br />
 307     html = html.replace('\n', ' ')
 308     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 309     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 310     # Strip html tags
 311     html = re.sub('<.*?>', '', html)
 312     # Replace html entities
 313     html = unescapeHTML(html)
 314     return html
 315
 316
 317 def sanitize_open(filename, open_mode):
 318     """Try to open the given filename, and slightly tweak it if this fails.
 319
 320     Attempts to open the given filename. If this fails, it tries to change
 321     the filename slightly, step by step, until it's either able to open it
 322     or it fails and raises a final exception, like the standard open()
 323     function.
 324
 325     It returns the tuple (stream, definitive_file_name).
 326     """
 327     try:
 328         if filename == u'-':
 329             if sys.platform == 'win32':
 330                 import msvcrt
 331                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 332             return (sys.stdout.buffer, filename)
 333         stream = open(encodeFilename(filename), open_mode)
 334         return (stream, filename)
 335     except (IOError, OSError) as err:
 336         # In case of error, try to remove win32 forbidden chars
 337         filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
 338
 339         # An exception here should be caught in the caller
 340         stream = open(encodeFilename(filename), open_mode)
 341         return (stream, filename)
 342
 343
 344 def timeconvert(timestr):
 345     """Convert RFC 2822 defined time string into system timestamp"""
 346     timestamp = None
 347     timetuple = email.utils.parsedate_tz(timestr)
 348     if timetuple is not None:
 349         timestamp = email.utils.mktime_tz(timetuple)
 350     return timestamp
 351
 352 def sanitize_filename(s, restricted=False, is_id=False):
 353     """Sanitizes a string so it could be used as part of a filename.
 354     If restricted is set, use a stricter subset of allowed characters.
 355     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 356     """
 357     def replace_insane(char):
 358         if char == '?' or ord(char) < 32 or ord(char) == 127:
 359             return ''
 360         elif char == '"':
 361             return '' if restricted else '\''
 362         elif char == ':':
 363             return '_-' if restricted else ' -'
 364         elif char in '\\/|*<>':
 365             return '_'
 366         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 367             return '_'
 368         if restricted and ord(char) > 127:
 369             return '_'
 370         return char
 371
 372     result = u''.join(map(replace_insane, s))
 373     if not is_id:
 374         while '__' in result:
 375             result = result.replace('__', '_')
 376         result = result.strip('_')
 377         # Common case of "Foreign band name - English song title"
 378         if restricted and result.startswith('-_'):
 379             result = result[2:]
 380         if not result:
 381             result = '_'
 382     return result
 383
 384 def orderedSet(iterable):
 385     """ Remove all duplicates from the input iterable """
 386     res = []
 387     for el in iterable:
 388         if el not in res:
 389             res.append(el)
 390     return res
 391
 392 def unescapeHTML(s):
 393     """
 394     @param s a string
 395     """
 396     assert type(s) == type(u'')
 397
 398     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 399     return result
 400
 401 def encodeFilename(s):
 402     """
 403     @param s The name of the file
 404     """
 405
 406     assert type(s) == type(u'')
 407
 408     # Python 3 has a Unicode API
 409     if sys.version_info >= (3, 0):
 410         return s
 411
 412     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 413         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 414         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 415         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 416         return s
 417     else:
 418         encoding = sys.getfilesystemencoding()
 419         if encoding is None:
 420             encoding = 'utf-8'
 421         return s.encode(encoding, 'ignore')
 422
 423 def decodeOption(optval):
 424     if optval is None:
 425         return optval
 426     if isinstance(optval, bytes):
 427         optval = optval.decode(preferredencoding())
 428
 429     assert isinstance(optval, compat_str)
 430     return optval
 431
 432 class ExtractorError(Exception):
 433     """Error during info extraction."""
 434     def __init__(self, msg, tb=None):
 435         """ tb, if given, is the original traceback (so that it can be printed out). """
 436         super(ExtractorError, self).__init__(msg)
 437         self.traceback = tb
 438
 439     def format_traceback(self):
 440         if self.traceback is None:
 441             return None
 442         return u''.join(traceback.format_tb(self.traceback))
 443
 444
 445 class DownloadError(Exception):
 446     """Download Error exception.
 447
 448     This exception may be thrown by FileDownloader objects if they are not
 449     configured to continue on errors. They will contain the appropriate
 450     error message.
 451     """
 452     pass
 453
 454
 455 class SameFileError(Exception):
 456     """Same File exception.
 457
 458     This exception will be thrown by FileDownloader objects if they detect
 459     multiple files would have to be downloaded to the same file on disk.
 460     """
 461     pass
 462
 463
 464 class PostProcessingError(Exception):
 465     """Post Processing exception.
 466
 467     This exception may be raised by PostProcessor's .run() method to
 468     indicate an error in the postprocessing task.
 469     """
 470     def __init__(self, msg):
 471         self.msg = msg
 472
 473 class MaxDownloadsReached(Exception):
 474     """ --max-downloads limit has been reached. """
 475     pass
 476
 477
 478 class UnavailableVideoError(Exception):
 479     """Unavailable Format exception.
 480
 481     This exception will be thrown when a video is requested
 482     in a format that is not available for that video.
 483     """
 484     pass
 485
 486
 487 class ContentTooShortError(Exception):
 488     """Content Too Short exception.
 489
 490     This exception may be raised by FileDownloader objects when a file they
 491     download is too small for what the server announced first, indicating
 492     the connection was probably interrupted.
 493     """
 494     # Both in bytes
 495     downloaded = None
 496     expected = None
 497
 498     def __init__(self, downloaded, expected):
 499         self.downloaded = downloaded
 500         self.expected = expected
 501
 502 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 503     """Handler for HTTP requests and responses.
 504
 505     This class, when installed with an OpenerDirector, automatically adds
 506     the standard headers to every HTTP request and handles gzipped and
 507     deflated responses from web servers. If compression is to be avoided in
 508     a particular request, the original request in the program code only has
 509     to include the HTTP header "Youtubedl-No-Compression", which will be
 510     removed before making the real request.
 511
 512     Part of this code was copied from:
 513
 514     http://techknack.net/python-urllib2-handlers/
 515
 516     Andrew Rowls, the author of that code, agreed to release it to the
 517     public domain.
 518     """
 519
 520     @staticmethod
 521     def deflate(data):
 522         try:
 523             return zlib.decompress(data, -zlib.MAX_WBITS)
 524         except zlib.error:
 525             return zlib.decompress(data)
 526
 527     @staticmethod
 528     def addinfourl_wrapper(stream, headers, url, code):
 529         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 530             return compat_urllib_request.addinfourl(stream, headers, url, code)
 531         ret = compat_urllib_request.addinfourl(stream, headers, url)
 532         ret.code = code
 533         return ret
 534
 535     def http_request(self, req):
 536         for h,v in std_headers.items():
 537             if h in req.headers:
 538                 del req.headers[h]
 539             req.add_header(h, v)
 540         if 'Youtubedl-no-compression' in req.headers:
 541             if 'Accept-encoding' in req.headers:
 542                 del req.headers['Accept-encoding']
 543             del req.headers['Youtubedl-no-compression']
 544         if 'Youtubedl-user-agent' in req.headers:
 545             if 'User-agent' in req.headers:
 546                 del req.headers['User-agent']
 547             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 548             del req.headers['Youtubedl-user-agent']
 549         return req
 550
 551     def http_response(self, req, resp):
 552         old_resp = resp
 553         # gzip
 554         if resp.headers.get('Content-encoding', '') == 'gzip':
 555             gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
 556             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 557             resp.msg = old_resp.msg
 558         # deflate
 559         if resp.headers.get('Content-encoding', '') == 'deflate':
 560             gz = io.BytesIO(self.deflate(resp.read()))
 561             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 562             resp.msg = old_resp.msg
 563         return resp
 564
 565     https_request = http_request
 566     https_response = http_response