#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import operator
import os
import pipes
import platform
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_etree_fromstring,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_kwargs,
    compat_parse_qs,
    compat_shlex_quote,
    compat_socket_create_connection,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non-ASCII characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On Python 2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
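
# Illustrative usage (doctest-style):
#   >>> xpath_with_ns('media:thumbnail', {'media': 'http://search.yahoo.com/mrss/'})
#   '{http://search.yahoo.com/mrss/}thumbnail'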


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    return get_element_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html)

    if not m:
        return None
    res = m.group('content')

    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)
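
# Illustrative usage:
#   >>> get_element_by_attribute('id', 'foo', '<div id="foo">bar</div>')
#   'bar'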


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
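
# Illustrative usage:
#   >>> clean_html('first line<br/>second line')
#   'first line\nsecond line'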


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
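
# Illustrative usage:
#   >>> sanitize_filename('A/B: C?')
#   'A_B - C'
#   >>> sanitize_filename('ä', restricted=True)
#   'a'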


def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)


# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    return 'http:%s' % url if url.startswith('//') else url


def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
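
# Illustrative usage:
#   >>> unescapeHTML('&amp;')
#   '&'
#   >>> unescapeHTML('&#39;')
#   "'"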


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs


def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg


class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        self.msg = msg


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
        # always respected by websites - some tend to give out URLs with non-percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around the aforementioned issue we will replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request in a row if there are non-ASCII
        # characters in the Set-Cookie HTTP header of the last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent-encode the Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


def extract_timezone(date_str):
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
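
# Illustrative usage:
#   >>> parse_iso8601('2014-03-23T23:04:26+0100')
#   1395612266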


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
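
# Illustrative usage:
#   >>> unified_strdate('December 21, 2010')
#   '20101221'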


def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600


def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
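
# Illustrative usage (trailing slash and query string are handled):
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'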


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format


def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
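
# Illustrative usage:
#   >>> date_from_str('20140320')
#   datetime.date(2014, 3, 20)
#   >>> date_from_str('now-1week') == datetime.date.today() - datetime.timedelta(weeks=1)
#   True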


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
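
# Illustrative usage:
#   >>> '20140115' in DateRange('20140101', '20140131')
#   True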


def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
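
# Illustrative usage (shown with Python 3 reprs):
#   >>> bytes_to_intlist(b'ab')
#   [97, 98]
#   >>> intlist_to_bytes([97, 98])
#   b'ab'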


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
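
# Illustrative round trip:
#   >>> unsmuggle_url(smuggle_url('http://example.com', {'a': 'b'}))
#   ('http://example.com', {'a': 'b'})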


def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)


def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)


def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
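
# Illustrative usage (decimal units multiply by powers of 1000, binary units
# by powers of 1024):
#   >>> parse_filesize('1.2Tb')
#   1200000000000
#   >>> parse_filesize('500 KiB')
#   512000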


def parse_count(s):
    if s is None:
        return None

    s = s.strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
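
# Illustrative usage:
#   >>> parse_count('1.8M')
#   1800000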


def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
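
# Illustrative usage (bare ampersands are escaped, existing entities are kept):
#   >>> fix_xml_ampersands('Tom & Jerry &amp; friends')
#   'Tom &amp; Jerry &amp; friends'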


def setproctitle(title):
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this


def remove_start(s, start):
    return s[len(start):] if s is not None and s.startswith(start) else s


def remove_end(s, end):
    return s[:-len(end)] if s is not None and s.endswith(end) else s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
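
# Illustrative usage:
#   >>> remove_start('www.example.com', 'www.')
#   'example.com'
#   >>> remove_quotes('"abc"')
#   'abc'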


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]


class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'HEAD'


class PUTRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'PUT'


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except ValueError:
        return default


def str_or_none(v, default=None):
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
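
# Illustrative usage (',', '.' and '+' are stripped before conversion):
#   >>> str_to_int('123,456')
#   123456
#   >>> str_to_int('123.456')
#   123456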


def float_or_none(v, scale=1, invscale=1, default=None):
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except ValueError:
        return default


def strip_or_none(v):
    return None if v is None else v.strip()


def parse_duration(s):
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
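
# Illustrative usage:
#   >>> parse_duration('9:12:43')
#   33163.0
#   >>> parse_duration('3 min')
#   180.0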


def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))


def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
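
# Illustrative usage:
#   >>> prepend_extension('abc.ext', 'temp')
#   'abc.temp.ext'
#   >>> replace_extension('abc.ext', 'temp')
#   'abc.temp'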


def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe


def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)


def detect_exe_version(output, version_re=None, unrecognized='present'):
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
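
# Illustrative usage (default version regex):
#   >>> detect_exe_version('ffmpeg version 2.4.4 Copyright (c) 2000-2014')
#   '2.4.4'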


class PagedList(object):
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())


class OnDemandPagedList(PagedList):
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if the current page is not "full", i.e. does
            # not contain page_size videos, then we can assume that this page
            # is the last one - there are no more ids on further pages,
            # so there is no need to query again.
1839 if len(page_results) + startv < self._pagesize:
1840 break
1841
1842 # If we got the whole page, but the next page is not interesting,
1843 # break out early as well
1844 if end == nextfirstid:
1845 break
1846 return res
1847
1848
1849 class InAdvancePagedList(PagedList):
1850 def __init__(self, pagefunc, pagecount, pagesize):
1851 self._pagefunc = pagefunc
1852 self._pagecount = pagecount
1853 self._pagesize = pagesize
1854
1855 def getslice(self, start=0, end=None):
1856 res = []
1857 start_page = start // self._pagesize
1858 end_page = (
1859 self._pagecount if end is None else (end // self._pagesize + 1))
1860 skip_elems = start - start_page * self._pagesize
1861 only_more = None if end is None else end - start
1862 for pagenum in range(start_page, end_page):
1863 page = list(self._pagefunc(pagenum))
1864 if skip_elems:
1865 page = page[skip_elems:]
1866 skip_elems = None
1867 if only_more is not None:
1868 if len(page) < only_more:
1869 only_more -= len(page)
1870 else:
1871 page = page[:only_more]
1872 res.extend(page)
1873 break
1874 res.extend(page)
1875 return res
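
    # Example (illustrative): 3 pages of 10 items, known in advance
    #   pl = InAdvancePagedList(lambda n: range(n * 10, (n + 1) * 10), 3, 10)
    #   pl.getslice(5, 15) == [5, 6, 7, 8, 9, 10, 11, 12, 13, 14]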
1876
1877
1878 def uppercase_escape(s):
1879 unicode_escape = codecs.getdecoder('unicode_escape')
1880 return re.sub(
1881 r'\\U[0-9a-fA-F]{8}',
1882 lambda m: unicode_escape(m.group(0))[0],
1883 s)
1884
1885
1886 def lowercase_escape(s):
1887 unicode_escape = codecs.getdecoder('unicode_escape')
1888 return re.sub(
1889 r'\\u[0-9a-fA-F]{4}',
1890 lambda m: unicode_escape(m.group(0))[0],
1891 s)
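
# Examples (illustrative):
#   uppercase_escape('\\U0001F600') == '\U0001F600'
#   lowercase_escape('\\u00e4') == '\u00e4'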
1892
1893
1894 def escape_rfc3986(s):
1895 """Escape non-ASCII characters as suggested by RFC 3986"""
1896 if sys.version_info < (3, 0) and isinstance(s, compat_str):
1897 s = s.encode('utf-8')
1898 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1899
1900
1901 def escape_url(url):
1902 """Escape URL as suggested by RFC 3986"""
1903 url_parsed = compat_urllib_parse_urlparse(url)
1904 return url_parsed._replace(
1905 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
1906 path=escape_rfc3986(url_parsed.path),
1907 params=escape_rfc3986(url_parsed.params),
1908 query=escape_rfc3986(url_parsed.query),
1909 fragment=escape_rfc3986(url_parsed.fragment)
1910 ).geturl()
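
# Example (illustrative, hypothetical URL; the netloc is IDNA-encoded,
# the rest is percent-escaped):
#   escape_url('http://müller.example/päth') == 'http://xn--mller-kva.example/p%C3%A4th'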
1911
1912
1913 def read_batch_urls(batch_fd):
1914 def fixup(url):
1915 if not isinstance(url, compat_str):
1916 url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 BOM: '\ufeff' once decoded, or '\xef\xbb\xbf'
        # if the BOM bytes were mis-decoded as latin-1
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
1920 url = url.strip()
1921 if url.startswith(('#', ';', ']')):
1922 return False
1923 return url
1924
1925 with contextlib.closing(batch_fd) as fd:
1926 return [url for url in map(fixup, fd) if url]
1927
1928
1929 def urlencode_postdata(*args, **kargs):
1930 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
1931
1932
1933 def update_url_query(url, query):
1934 if not query:
1935 return url
1936 parsed_url = compat_urlparse.urlparse(url)
1937 qs = compat_parse_qs(parsed_url.query)
1938 qs.update(query)
1939 return compat_urlparse.urlunparse(parsed_url._replace(
1940 query=compat_urllib_parse_urlencode(qs, True)))
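
# Example (illustrative; parameter order may vary across Python versions):
#   update_url_query('http://example.com/path?a=1', {'b': '2'})
#   == 'http://example.com/path?a=1&b=2'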
1941
1942
1943 def update_Request(req, url=None, data=None, headers={}, query={}):
1944 req_headers = req.headers.copy()
1945 req_headers.update(headers)
1946 req_data = data or req.data
1947 req_url = update_url_query(url or req.get_full_url(), query)
1948 req_get_method = req.get_method()
1949 if req_get_method == 'HEAD':
1950 req_type = HEADRequest
1951 elif req_get_method == 'PUT':
1952 req_type = PUTRequest
1953 else:
1954 req_type = compat_urllib_request.Request
1955 new_req = req_type(
1956 req_url, data=req_data, headers=req_headers,
1957 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
1958 if hasattr(req, 'timeout'):
1959 new_req.timeout = req.timeout
1960 return new_req
1961
1962
1963 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
1964 if isinstance(key_or_keys, (list, tuple)):
1965 for key in key_or_keys:
1966 if key not in d or d[key] is None or skip_false_values and not d[key]:
1967 continue
1968 return d[key]
1969 return default
1970 return d.get(key_or_keys, default)
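
# Examples (illustrative):
#   dict_get({'a': '', 'b': 'x'}, ('a', 'b')) == 'x'  # '' is skipped as a false value
#   dict_get({'a': ''}, ('a', 'b'), skip_false_values=False) == ''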
1971
1972
1973 def try_get(src, getter, expected_type=None):
1974 try:
1975 v = getter(src)
1976 except (AttributeError, KeyError, TypeError, IndexError):
1977 pass
1978 else:
1979 if expected_type is None or isinstance(v, expected_type):
1980 return v
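
# Examples (illustrative):
#   try_get({'a': [1, 2]}, lambda x: x['a'][0], int) == 1
#   try_get({}, lambda x: x['a'][0]) is None  # the KeyError is swallowed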
1981
1982
1983 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
1984 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
1985
1986
1987 US_RATINGS = {
1988 'G': 0,
1989 'PG': 10,
1990 'PG-13': 13,
1991 'R': 16,
1992 'NC': 18,
1993 }
1994
1995
1996 TV_PARENTAL_GUIDELINES = {
1997 'TV-Y': 0,
1998 'TV-Y7': 7,
1999 'TV-G': 0,
2000 'TV-PG': 0,
2001 'TV-14': 14,
2002 'TV-MA': 17,
2003 }
2004
2005
2006 def parse_age_limit(s):
2007 if type(s) == int:
2008 return s if 0 <= s <= 21 else None
2009 if not isinstance(s, compat_basestring):
2010 return None
2011 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2012 if m:
2013 return int(m.group('age'))
2014 if s in US_RATINGS:
2015 return US_RATINGS[s]
2016 return TV_PARENTAL_GUIDELINES.get(s)
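
# Examples (illustrative):
#   parse_age_limit('16+') == 16
#   parse_age_limit('PG-13') == 13
#   parse_age_limit('TV-MA') == 17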
2017
2018
2019 def strip_jsonp(code):
2020 return re.sub(
2021 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
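
# Example (illustrative):
#   strip_jsonp('callback({"status": "ok"});') == '{"status": "ok"}'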
2022
2023
2024 def js_to_json(code):
2025 def fix_kv(m):
2026 v = m.group(0)
2027 if v in ('true', 'false', 'null'):
2028 return v
2029 elif v.startswith('/*') or v == ',':
2030 return ""
2031
2032 if v[0] in ("'", '"'):
2033 v = re.sub(r'(?s)\\.|"', lambda m: {
2034 '"': '\\"',
2035 "\\'": "'",
2036 '\\\n': '',
2037 '\\x': '\\u00',
2038 }.get(m.group(0), m.group(0)), v[1:-1])
2039
2040 INTEGER_TABLE = (
2041 (r'^0[xX][0-9a-fA-F]+', 16),
2042 (r'^0+[0-7]+', 8),
2043 )
2044
2045 for regex, base in INTEGER_TABLE:
2046 im = re.match(regex, v)
2047 if im:
2048 i = int(im.group(0), base)
2049 return '"%d":' % i if v.endswith(':') else '%d' % i
2050
2051 return '"%s"' % v
2052
2053 return re.sub(r'''(?sx)
2054 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2055 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2056 /\*.*?\*/|,(?=\s*[\]}])|
2057 [a-zA-Z_][.a-zA-Z_0-9]*|
2058 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
2059 [0-9]+(?=\s*:)
2060 ''', fix_kv, code)
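
# Examples (illustrative):
#   js_to_json("{abc: 'def',}") == '{"abc": "def"}'
#   js_to_json('{"x": 0x1F}') == '{"x": 31}'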
2061
2062
2063 def qualities(quality_ids):
2064 """ Get a numeric quality value out of a list of possible values """
2065 def q(qid):
2066 try:
2067 return quality_ids.index(qid)
2068 except ValueError:
2069 return -1
2070 return q
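
# Example (illustrative):
#   q = qualities(['144p', '480p', '1080p'])
#   q('480p') == 1; q('4K') == -1  # unknown ids sort below all known ones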
2071
2072
2073 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2074
2075
2076 def limit_length(s, length):
2077 """ Add ellipses to overly long strings """
2078 if s is None:
2079 return None
2080 ELLIPSES = '...'
2081 if len(s) > length:
2082 return s[:length - len(ELLIPSES)] + ELLIPSES
2083 return s
2084
2085
2086 def version_tuple(v):
2087 return tuple(int(e) for e in re.split(r'[-.]', v))
2088
2089
2090 def is_outdated_version(version, limit, assume_new=True):
2091 if not version:
2092 return not assume_new
2093 try:
2094 return version_tuple(version) < version_tuple(limit)
2095 except ValueError:
2096 return not assume_new
2097
2098
2099 def ytdl_is_updateable():
2100 """ Returns if youtube-dl can be updated with -U """
2101 from zipimport import zipimporter
2102
2103 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
2104
2105
2106 def args_to_str(args):
2107 # Get a short string representation for a subprocess command
2108 return ' '.join(compat_shlex_quote(a) for a in args)
2109
2110
2111 def error_to_compat_str(err):
2112 err_str = str(err)
    # On Python 2 the error byte string must be decoded with the proper
    # encoding rather than ascii
2115 if sys.version_info[0] < 3:
2116 err_str = err_str.decode(preferredencoding())
2117 return err_str
2118
2119
2120 def mimetype2ext(mt):
2121 if mt is None:
2122 return None
2123
2124 ext = {
2125 'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3
        # here since it is the most popular one
2128 'audio/mpeg': 'mp3',
2129 }.get(mt)
2130 if ext is not None:
2131 return ext
2132
2133 _, _, res = mt.rpartition('/')
2134 res = res.lower()
2135
2136 return {
2137 '3gpp': '3gp',
2138 'smptett+xml': 'tt',
2139 'srt': 'srt',
2140 'ttaf+xml': 'dfxp',
2141 'ttml+xml': 'ttml',
2142 'vtt': 'vtt',
2143 'x-flv': 'flv',
2144 'x-mp4-fragmented': 'mp4',
2145 'x-ms-wmv': 'wmv',
2146 'mpegurl': 'm3u8',
2147 'x-mpegurl': 'm3u8',
2148 'vnd.apple.mpegurl': 'm3u8',
2149 'dash+xml': 'mpd',
2150 'f4m': 'f4m',
2151 'f4m+xml': 'f4m',
2152 'hds+xml': 'f4m',
2153 'vnd.ms-sstr+xml': 'ism',
2154 }.get(res, res)
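
# Examples (illustrative):
#   mimetype2ext('audio/mp4') == 'm4a'
#   mimetype2ext('application/x-mpegURL') == 'm3u8'
#   mimetype2ext('video/mp4') == 'mp4'  # unknown subtypes pass through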
2155
2156
def parse_codecs(codecs_str):
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        lambda s: s.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # No codec was recognized; fall back on the conventional
        # "video, audio" ordering of the codecs attribute
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        elif len(split_codecs) == 1:
            return {
                'vcodec': 'none',
                'acodec': split_codecs[0],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
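
# Examples (illustrative):
#   parse_codecs('avc1.42001e, mp4a.40.2') == {'vcodec': 'avc1.42001e', 'acodec': 'mp4a.40.2'}
#   parse_codecs('opus') == {'vcodec': 'none', 'acodec': 'opus'}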
2191
2192
2193 def urlhandle_detect_ext(url_handle):
2194 getheader = url_handle.headers.get
2195
2196 cd = getheader('Content-Disposition')
2197 if cd:
2198 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2199 if m:
2200 e = determine_ext(m.group('filename'), default_ext=None)
2201 if e:
2202 return e
2203
2204 return mimetype2ext(getheader('Content-Type'))
2205
2206
2207 def encode_data_uri(data, mime_type):
2208 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2209
2210
2211 def age_restricted(content_limit, age_limit):
2212 """ Returns True iff the content should be blocked """
2213
2214 if age_limit is None: # No limit set
2215 return False
2216 if content_limit is None:
2217 return False # Content available for everyone
2218 return age_limit < content_limit
2219
2220
2221 def is_html(first_bytes):
2222 """ Detect whether a file contains HTML by examining its first bytes. """
2223
2224 BOMS = [
2225 (b'\xef\xbb\xbf', 'utf-8'),
2226 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2227 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2228 (b'\xff\xfe', 'utf-16-le'),
2229 (b'\xfe\xff', 'utf-16-be'),
2230 ]
2231 for bom, enc in BOMS:
2232 if first_bytes.startswith(bom):
2233 s = first_bytes[len(bom):].decode(enc, 'replace')
2234 break
2235 else:
2236 s = first_bytes.decode('utf-8', 'replace')
2237
2238 return re.match(r'^\s*<', s)
2239
2240
2241 def determine_protocol(info_dict):
2242 protocol = info_dict.get('protocol')
2243 if protocol is not None:
2244 return protocol
2245
2246 url = info_dict['url']
2247 if url.startswith('rtmp'):
2248 return 'rtmp'
2249 elif url.startswith('mms'):
2250 return 'mms'
2251 elif url.startswith('rtsp'):
2252 return 'rtsp'
2253
2254 ext = determine_ext(url)
2255 if ext == 'm3u8':
2256 return 'm3u8'
2257 elif ext == 'f4m':
2258 return 'f4m'
2259
2260 return compat_urllib_parse_urlparse(url).scheme
2261
2262
2263 def render_table(header_row, data):
2264 """ Render a list of rows, each as a list of values """
2265 table = [header_row] + data
2266 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2267 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2268 return '\n'.join(format_str % tuple(row) for row in table)
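
# Example (illustrative): columns are left-aligned to the widest cell
#   render_table(['id', 'name'], [['1', 'a'], ['22', 'bb']])
#   == 'id name\n1  a\n22 bb'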
2269
2270
2271 def _match_one(filter_part, dct):
2272 COMPARISON_OPERATORS = {
2273 '<': operator.lt,
2274 '<=': operator.le,
2275 '>': operator.gt,
2276 '>=': operator.ge,
2277 '=': operator.eq,
2278 '!=': operator.ne,
2279 }
2280 operator_rex = re.compile(r'''(?x)\s*
2281 (?P<key>[a-z_]+)
2282 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2283 (?:
2284 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2285 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2286 )
2287 \s*$
2288 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2289 m = operator_rex.search(filter_part)
2290 if m:
2291 op = COMPARISON_OPERATORS[m.group('op')]
2292 if m.group('strval') is not None:
2293 if m.group('op') not in ('=', '!='):
2294 raise ValueError(
2295 'Operator %s does not support string values!' % m.group('op'))
2296 comparison_value = m.group('strval')
2297 else:
2298 try:
2299 comparison_value = int(m.group('intval'))
2300 except ValueError:
2301 comparison_value = parse_filesize(m.group('intval'))
2302 if comparison_value is None:
2303 comparison_value = parse_filesize(m.group('intval') + 'B')
2304 if comparison_value is None:
2305 raise ValueError(
2306 'Invalid integer value %r in filter part %r' % (
2307 m.group('intval'), filter_part))
2308 actual_value = dct.get(m.group('key'))
2309 if actual_value is None:
2310 return m.group('none_inclusive')
2311 return op(actual_value, comparison_value)
2312
2313 UNARY_OPERATORS = {
2314 '': lambda v: v is not None,
2315 '!': lambda v: v is None,
2316 }
2317 operator_rex = re.compile(r'''(?x)\s*
2318 (?P<op>%s)\s*(?P<key>[a-z_]+)
2319 \s*$
2320 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2321 m = operator_rex.search(filter_part)
2322 if m:
2323 op = UNARY_OPERATORS[m.group('op')]
2324 actual_value = dct.get(m.group('key'))
2325 return op(actual_value)
2326
2327 raise ValueError('Invalid filter part %r' % filter_part)
2328
2329
2330 def match_str(filter_str, dct):
2331 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2332
2333 return all(
2334 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
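
# Examples (illustrative):
#   match_str('like_count > 100 & dislike_count <? 50',
#             {'like_count': 190, 'dislike_count': 10}) == True
#   match_str('!is_live', {'is_live': None}) == True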
2335
2336
2337 def match_filter_func(filter_str):
2338 def _match_func(info_dict):
2339 if match_str(filter_str, info_dict):
2340 return None
2341 else:
2342 video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ...' % (video_title, filter_str)
2344 return _match_func
2345
2346
2347 def parse_dfxp_time_expr(time_expr):
2348 if not time_expr:
2349 return
2350
2351 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2352 if mobj:
2353 return float(mobj.group('time_offset'))
2354
2355 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2356 if mobj:
2357 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
2358
2359
2360 def srt_subtitles_timecode(seconds):
2361 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
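
# Examples (illustrative):
#   parse_dfxp_time_expr('00:01:30.5') == 90.5
#   srt_subtitles_timecode(90.5) == '00:01:30,500'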
2362
2363
2364 def dfxp2srt(dfxp_data):
2365 _x = functools.partial(xpath_with_ns, ns_map={
2366 'ttml': 'http://www.w3.org/ns/ttml',
2367 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2368 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
2369 })
2370
2371 class TTMLPElementParser(object):
2372 out = ''
2373
2374 def start(self, tag, attrib):
2375 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2376 self.out += '\n'
2377
2378 def end(self, tag):
2379 pass
2380
2381 def data(self, data):
2382 self.out += data
2383
2384 def close(self):
2385 return self.out.strip()
2386
2387 def parse_node(node):
2388 target = TTMLPElementParser()
2389 parser = xml.etree.ElementTree.XMLParser(target=target)
2390 parser.feed(xml.etree.ElementTree.tostring(node))
2391 return parser.close()
2392
2393 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2394 out = []
2395 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
2396
2397 if not paras:
2398 raise ValueError('Invalid dfxp/TTML subtitle')
2399
2400 for para, index in zip(paras, itertools.count(1)):
2401 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2402 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2403 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2404 if begin_time is None:
2405 continue
2406 if not end_time:
2407 if not dur:
2408 continue
2409 end_time = begin_time + dur
2410 out.append('%d\n%s --> %s\n%s\n\n' % (
2411 index,
2412 srt_subtitles_timecode(begin_time),
2413 srt_subtitles_timecode(end_time),
2414 parse_node(para)))
2415
2416 return ''.join(out)
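
# Example (illustrative, minimal TTML document):
#   dfxp2srt('<tt xmlns="http://www.w3.org/ns/ttml"><body>'
#            '<p begin="1" end="2.5">Hi</p></body></tt>')
#   == '1\n00:00:01,000 --> 00:00:02,500\nHi\n\n'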
2417
2418
2419 def cli_option(params, command_option, param):
2420 param = params.get(param)
2421 if param:
2422 param = compat_str(param)
2423 return [command_option, param] if param is not None else []
2424
2425
2426 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2427 param = params.get(param)
2428 assert isinstance(param, bool)
2429 if separator:
2430 return [command_option + separator + (true_value if param else false_value)]
2431 return [command_option, true_value if param else false_value]
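
# Examples (illustrative):
#   cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   == ['--no-check-certificate', 'true']
#   cli_bool_option({'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '=')
#   == ['--check-certificate=true']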
2432
2433
2434 def cli_valueless_option(params, command_option, param, expected_value=True):
2435 param = params.get(param)
2436 return [command_option] if param == expected_value else []
2437
2438
2439 def cli_configuration_args(params, param, default=[]):
2440 ex_args = params.get(param)
2441 if ex_args is None:
2442 return default
2443 assert isinstance(ex_args, list)
2444 return ex_args
2445
2446
2447 class ISO639Utils(object):
2448 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2449 _lang_map = {
2450 'aa': 'aar',
2451 'ab': 'abk',
2452 'ae': 'ave',
2453 'af': 'afr',
2454 'ak': 'aka',
2455 'am': 'amh',
2456 'an': 'arg',
2457 'ar': 'ara',
2458 'as': 'asm',
2459 'av': 'ava',
2460 'ay': 'aym',
2461 'az': 'aze',
2462 'ba': 'bak',
2463 'be': 'bel',
2464 'bg': 'bul',
2465 'bh': 'bih',
2466 'bi': 'bis',
2467 'bm': 'bam',
2468 'bn': 'ben',
2469 'bo': 'bod',
2470 'br': 'bre',
2471 'bs': 'bos',
2472 'ca': 'cat',
2473 'ce': 'che',
2474 'ch': 'cha',
2475 'co': 'cos',
2476 'cr': 'cre',
2477 'cs': 'ces',
2478 'cu': 'chu',
2479 'cv': 'chv',
2480 'cy': 'cym',
2481 'da': 'dan',
2482 'de': 'deu',
2483 'dv': 'div',
2484 'dz': 'dzo',
2485 'ee': 'ewe',
2486 'el': 'ell',
2487 'en': 'eng',
2488 'eo': 'epo',
2489 'es': 'spa',
2490 'et': 'est',
2491 'eu': 'eus',
2492 'fa': 'fas',
2493 'ff': 'ful',
2494 'fi': 'fin',
2495 'fj': 'fij',
2496 'fo': 'fao',
2497 'fr': 'fra',
2498 'fy': 'fry',
2499 'ga': 'gle',
2500 'gd': 'gla',
2501 'gl': 'glg',
2502 'gn': 'grn',
2503 'gu': 'guj',
2504 'gv': 'glv',
2505 'ha': 'hau',
2506 'he': 'heb',
2507 'hi': 'hin',
2508 'ho': 'hmo',
2509 'hr': 'hrv',
2510 'ht': 'hat',
2511 'hu': 'hun',
2512 'hy': 'hye',
2513 'hz': 'her',
2514 'ia': 'ina',
2515 'id': 'ind',
2516 'ie': 'ile',
2517 'ig': 'ibo',
2518 'ii': 'iii',
2519 'ik': 'ipk',
2520 'io': 'ido',
2521 'is': 'isl',
2522 'it': 'ita',
2523 'iu': 'iku',
2524 'ja': 'jpn',
2525 'jv': 'jav',
2526 'ka': 'kat',
2527 'kg': 'kon',
2528 'ki': 'kik',
2529 'kj': 'kua',
2530 'kk': 'kaz',
2531 'kl': 'kal',
2532 'km': 'khm',
2533 'kn': 'kan',
2534 'ko': 'kor',
2535 'kr': 'kau',
2536 'ks': 'kas',
2537 'ku': 'kur',
2538 'kv': 'kom',
2539 'kw': 'cor',
2540 'ky': 'kir',
2541 'la': 'lat',
2542 'lb': 'ltz',
2543 'lg': 'lug',
2544 'li': 'lim',
2545 'ln': 'lin',
2546 'lo': 'lao',
2547 'lt': 'lit',
2548 'lu': 'lub',
2549 'lv': 'lav',
2550 'mg': 'mlg',
2551 'mh': 'mah',
2552 'mi': 'mri',
2553 'mk': 'mkd',
2554 'ml': 'mal',
2555 'mn': 'mon',
2556 'mr': 'mar',
2557 'ms': 'msa',
2558 'mt': 'mlt',
2559 'my': 'mya',
2560 'na': 'nau',
2561 'nb': 'nob',
2562 'nd': 'nde',
2563 'ne': 'nep',
2564 'ng': 'ndo',
2565 'nl': 'nld',
2566 'nn': 'nno',
2567 'no': 'nor',
2568 'nr': 'nbl',
2569 'nv': 'nav',
2570 'ny': 'nya',
2571 'oc': 'oci',
2572 'oj': 'oji',
2573 'om': 'orm',
2574 'or': 'ori',
2575 'os': 'oss',
2576 'pa': 'pan',
2577 'pi': 'pli',
2578 'pl': 'pol',
2579 'ps': 'pus',
2580 'pt': 'por',
2581 'qu': 'que',
2582 'rm': 'roh',
2583 'rn': 'run',
2584 'ro': 'ron',
2585 'ru': 'rus',
2586 'rw': 'kin',
2587 'sa': 'san',
2588 'sc': 'srd',
2589 'sd': 'snd',
2590 'se': 'sme',
2591 'sg': 'sag',
2592 'si': 'sin',
2593 'sk': 'slk',
2594 'sl': 'slv',
2595 'sm': 'smo',
2596 'sn': 'sna',
2597 'so': 'som',
2598 'sq': 'sqi',
2599 'sr': 'srp',
2600 'ss': 'ssw',
2601 'st': 'sot',
2602 'su': 'sun',
2603 'sv': 'swe',
2604 'sw': 'swa',
2605 'ta': 'tam',
2606 'te': 'tel',
2607 'tg': 'tgk',
2608 'th': 'tha',
2609 'ti': 'tir',
2610 'tk': 'tuk',
2611 'tl': 'tgl',
2612 'tn': 'tsn',
2613 'to': 'ton',
2614 'tr': 'tur',
2615 'ts': 'tso',
2616 'tt': 'tat',
2617 'tw': 'twi',
2618 'ty': 'tah',
2619 'ug': 'uig',
2620 'uk': 'ukr',
2621 'ur': 'urd',
2622 'uz': 'uzb',
2623 've': 'ven',
2624 'vi': 'vie',
2625 'vo': 'vol',
2626 'wa': 'wln',
2627 'wo': 'wol',
2628 'xh': 'xho',
2629 'yi': 'yid',
2630 'yo': 'yor',
2631 'za': 'zha',
2632 'zh': 'zho',
2633 'zu': 'zul',
2634 }
2635
2636 @classmethod
2637 def short2long(cls, code):
2638 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2639 return cls._lang_map.get(code[:2])
2640
2641 @classmethod
2642 def long2short(cls, code):
2643 """Convert language code from ISO 639-2/T to ISO 639-1"""
2644 for short_name, long_name in cls._lang_map.items():
2645 if long_name == code:
2646 return short_name
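
    # Examples (illustrative):
    #   ISO639Utils.short2long('en') == 'eng'
    #   ISO639Utils.long2short('fra') == 'fr'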
2647
2648
2649 class ISO3166Utils(object):
2650 # From http://data.okfn.org/data/core/country-list
2651 _country_map = {
2652 'AF': 'Afghanistan',
2653 'AX': 'Åland Islands',
2654 'AL': 'Albania',
2655 'DZ': 'Algeria',
2656 'AS': 'American Samoa',
2657 'AD': 'Andorra',
2658 'AO': 'Angola',
2659 'AI': 'Anguilla',
2660 'AQ': 'Antarctica',
2661 'AG': 'Antigua and Barbuda',
2662 'AR': 'Argentina',
2663 'AM': 'Armenia',
2664 'AW': 'Aruba',
2665 'AU': 'Australia',
2666 'AT': 'Austria',
2667 'AZ': 'Azerbaijan',
2668 'BS': 'Bahamas',
2669 'BH': 'Bahrain',
2670 'BD': 'Bangladesh',
2671 'BB': 'Barbados',
2672 'BY': 'Belarus',
2673 'BE': 'Belgium',
2674 'BZ': 'Belize',
2675 'BJ': 'Benin',
2676 'BM': 'Bermuda',
2677 'BT': 'Bhutan',
2678 'BO': 'Bolivia, Plurinational State of',
2679 'BQ': 'Bonaire, Sint Eustatius and Saba',
2680 'BA': 'Bosnia and Herzegovina',
2681 'BW': 'Botswana',
2682 'BV': 'Bouvet Island',
2683 'BR': 'Brazil',
2684 'IO': 'British Indian Ocean Territory',
2685 'BN': 'Brunei Darussalam',
2686 'BG': 'Bulgaria',
2687 'BF': 'Burkina Faso',
2688 'BI': 'Burundi',
2689 'KH': 'Cambodia',
2690 'CM': 'Cameroon',
2691 'CA': 'Canada',
2692 'CV': 'Cape Verde',
2693 'KY': 'Cayman Islands',
2694 'CF': 'Central African Republic',
2695 'TD': 'Chad',
2696 'CL': 'Chile',
2697 'CN': 'China',
2698 'CX': 'Christmas Island',
2699 'CC': 'Cocos (Keeling) Islands',
2700 'CO': 'Colombia',
2701 'KM': 'Comoros',
2702 'CG': 'Congo',
2703 'CD': 'Congo, the Democratic Republic of the',
2704 'CK': 'Cook Islands',
2705 'CR': 'Costa Rica',
2706 'CI': 'Côte d\'Ivoire',
2707 'HR': 'Croatia',
2708 'CU': 'Cuba',
2709 'CW': 'Curaçao',
2710 'CY': 'Cyprus',
2711 'CZ': 'Czech Republic',
2712 'DK': 'Denmark',
2713 'DJ': 'Djibouti',
2714 'DM': 'Dominica',
2715 'DO': 'Dominican Republic',
2716 'EC': 'Ecuador',
2717 'EG': 'Egypt',
2718 'SV': 'El Salvador',
2719 'GQ': 'Equatorial Guinea',
2720 'ER': 'Eritrea',
2721 'EE': 'Estonia',
2722 'ET': 'Ethiopia',
2723 'FK': 'Falkland Islands (Malvinas)',
2724 'FO': 'Faroe Islands',
2725 'FJ': 'Fiji',
2726 'FI': 'Finland',
2727 'FR': 'France',
2728 'GF': 'French Guiana',
2729 'PF': 'French Polynesia',
2730 'TF': 'French Southern Territories',
2731 'GA': 'Gabon',
2732 'GM': 'Gambia',
2733 'GE': 'Georgia',
2734 'DE': 'Germany',
2735 'GH': 'Ghana',
2736 'GI': 'Gibraltar',
2737 'GR': 'Greece',
2738 'GL': 'Greenland',
2739 'GD': 'Grenada',
2740 'GP': 'Guadeloupe',
2741 'GU': 'Guam',
2742 'GT': 'Guatemala',
2743 'GG': 'Guernsey',
2744 'GN': 'Guinea',
2745 'GW': 'Guinea-Bissau',
2746 'GY': 'Guyana',
2747 'HT': 'Haiti',
2748 'HM': 'Heard Island and McDonald Islands',
2749 'VA': 'Holy See (Vatican City State)',
2750 'HN': 'Honduras',
2751 'HK': 'Hong Kong',
2752 'HU': 'Hungary',
2753 'IS': 'Iceland',
2754 'IN': 'India',
2755 'ID': 'Indonesia',
2756 'IR': 'Iran, Islamic Republic of',
2757 'IQ': 'Iraq',
2758 'IE': 'Ireland',
2759 'IM': 'Isle of Man',
2760 'IL': 'Israel',
2761 'IT': 'Italy',
2762 'JM': 'Jamaica',
2763 'JP': 'Japan',
2764 'JE': 'Jersey',
2765 'JO': 'Jordan',
2766 'KZ': 'Kazakhstan',
2767 'KE': 'Kenya',
2768 'KI': 'Kiribati',
2769 'KP': 'Korea, Democratic People\'s Republic of',
2770 'KR': 'Korea, Republic of',
2771 'KW': 'Kuwait',
2772 'KG': 'Kyrgyzstan',
2773 'LA': 'Lao People\'s Democratic Republic',
2774 'LV': 'Latvia',
2775 'LB': 'Lebanon',
2776 'LS': 'Lesotho',
2777 'LR': 'Liberia',
2778 'LY': 'Libya',
2779 'LI': 'Liechtenstein',
2780 'LT': 'Lithuania',
2781 'LU': 'Luxembourg',
2782 'MO': 'Macao',
2783 'MK': 'Macedonia, the Former Yugoslav Republic of',
2784 'MG': 'Madagascar',
2785 'MW': 'Malawi',
2786 'MY': 'Malaysia',
2787 'MV': 'Maldives',
2788 'ML': 'Mali',
2789 'MT': 'Malta',
2790 'MH': 'Marshall Islands',
2791 'MQ': 'Martinique',
2792 'MR': 'Mauritania',
2793 'MU': 'Mauritius',
2794 'YT': 'Mayotte',
2795 'MX': 'Mexico',
2796 'FM': 'Micronesia, Federated States of',
2797 'MD': 'Moldova, Republic of',
2798 'MC': 'Monaco',
2799 'MN': 'Mongolia',
2800 'ME': 'Montenegro',
2801 'MS': 'Montserrat',
2802 'MA': 'Morocco',
2803 'MZ': 'Mozambique',
2804 'MM': 'Myanmar',
2805 'NA': 'Namibia',
2806 'NR': 'Nauru',
2807 'NP': 'Nepal',
2808 'NL': 'Netherlands',
2809 'NC': 'New Caledonia',
2810 'NZ': 'New Zealand',
2811 'NI': 'Nicaragua',
2812 'NE': 'Niger',
2813 'NG': 'Nigeria',
2814 'NU': 'Niue',
2815 'NF': 'Norfolk Island',
2816 'MP': 'Northern Mariana Islands',
2817 'NO': 'Norway',
2818 'OM': 'Oman',
2819 'PK': 'Pakistan',
2820 'PW': 'Palau',
2821 'PS': 'Palestine, State of',
2822 'PA': 'Panama',
2823 'PG': 'Papua New Guinea',
2824 'PY': 'Paraguay',
2825 'PE': 'Peru',
2826 'PH': 'Philippines',
2827 'PN': 'Pitcairn',
2828 'PL': 'Poland',
2829 'PT': 'Portugal',
2830 'PR': 'Puerto Rico',
2831 'QA': 'Qatar',
2832 'RE': 'Réunion',
2833 'RO': 'Romania',
2834 'RU': 'Russian Federation',
2835 'RW': 'Rwanda',
2836 'BL': 'Saint Barthélemy',
2837 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2838 'KN': 'Saint Kitts and Nevis',
2839 'LC': 'Saint Lucia',
2840 'MF': 'Saint Martin (French part)',
2841 'PM': 'Saint Pierre and Miquelon',
2842 'VC': 'Saint Vincent and the Grenadines',
2843 'WS': 'Samoa',
2844 'SM': 'San Marino',
2845 'ST': 'Sao Tome and Principe',
2846 'SA': 'Saudi Arabia',
2847 'SN': 'Senegal',
2848 'RS': 'Serbia',
2849 'SC': 'Seychelles',
2850 'SL': 'Sierra Leone',
2851 'SG': 'Singapore',
2852 'SX': 'Sint Maarten (Dutch part)',
2853 'SK': 'Slovakia',
2854 'SI': 'Slovenia',
2855 'SB': 'Solomon Islands',
2856 'SO': 'Somalia',
2857 'ZA': 'South Africa',
2858 'GS': 'South Georgia and the South Sandwich Islands',
2859 'SS': 'South Sudan',
2860 'ES': 'Spain',
2861 'LK': 'Sri Lanka',
2862 'SD': 'Sudan',
2863 'SR': 'Suriname',
2864 'SJ': 'Svalbard and Jan Mayen',
2865 'SZ': 'Swaziland',
2866 'SE': 'Sweden',
2867 'CH': 'Switzerland',
2868 'SY': 'Syrian Arab Republic',
2869 'TW': 'Taiwan, Province of China',
2870 'TJ': 'Tajikistan',
2871 'TZ': 'Tanzania, United Republic of',
2872 'TH': 'Thailand',
2873 'TL': 'Timor-Leste',
2874 'TG': 'Togo',
2875 'TK': 'Tokelau',
2876 'TO': 'Tonga',
2877 'TT': 'Trinidad and Tobago',
2878 'TN': 'Tunisia',
2879 'TR': 'Turkey',
2880 'TM': 'Turkmenistan',
2881 'TC': 'Turks and Caicos Islands',
2882 'TV': 'Tuvalu',
2883 'UG': 'Uganda',
2884 'UA': 'Ukraine',
2885 'AE': 'United Arab Emirates',
2886 'GB': 'United Kingdom',
2887 'US': 'United States',
2888 'UM': 'United States Minor Outlying Islands',
2889 'UY': 'Uruguay',
2890 'UZ': 'Uzbekistan',
2891 'VU': 'Vanuatu',
2892 'VE': 'Venezuela, Bolivarian Republic of',
2893 'VN': 'Viet Nam',
2894 'VG': 'Virgin Islands, British',
2895 'VI': 'Virgin Islands, U.S.',
2896 'WF': 'Wallis and Futuna',
2897 'EH': 'Western Sahara',
2898 'YE': 'Yemen',
2899 'ZM': 'Zambia',
2900 'ZW': 'Zimbabwe',
2901 }
2902
2903 @classmethod
2904 def short2full(cls, code):
2905 """Convert an ISO 3166-2 country code to the corresponding full name"""
2906 return cls._country_map.get(code.upper())
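
    # Example (illustrative):
    #   ISO3166Utils.short2full('DE') == 'Germany'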
2907
2908
2909 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2910 def __init__(self, proxies=None):
2911 # Set default handlers
2912 for type in ('http', 'https'):
2913 setattr(self, '%s_open' % type,
2914 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2915 meth(r, proxy, type))
2916 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2917
2918 def proxy_open(self, req, proxy, type):
2919 req_proxy = req.headers.get('Ytdl-request-proxy')
2920 if req_proxy is not None:
2921 proxy = req_proxy
2922 del req.headers['Ytdl-request-proxy']
2923
2924 if proxy == '__noproxy__':
2925 return None # No Proxy
2926 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
2927 req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do the wrapping of the socket with SOCKS themselves
2929 return None
2930 return compat_urllib_request.ProxyHandler.proxy_open(
2931 self, req, proxy, type)
2932
2933
2934 def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, a bytes-like object
        exponent, modulus: parameters e and N of the RSA algorithm, both integers
    Output: hex string of encrypted data

    Limitation: supports one-block encryption only
    '''
2945
2946 payload = int(binascii.hexlify(data[::-1]), 16)
2947 encrypted = pow(payload, exponent, modulus)
2948 return '%x' % encrypted
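
# Example (illustrative): the input bytes are read as a little-endian integer
#   ohdave_rsa_encrypt(b'\x01\x02', 1, 0x10000) == '201'  # pow(0x0201, 1, 0x10000) in hex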
2949
2950
2951 def encode_base_n(num, n, table=None):
2952 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
2953 if not table:
2954 table = FULL_TABLE[:n]
2955
2956 if n > len(table):
2957 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
2958
2959 if num == 0:
2960 return table[0]
2961
2962 ret = ''
2963 while num:
2964 ret = table[num % n] + ret
2965 num = num // n
2966 return ret
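
# Examples (illustrative):
#   encode_base_n(255, 16) == 'ff'
#   encode_base_n(0, 2) == '0'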
2967
2968
2969 def decode_packed_codes(code):
2970 mobj = re.search(
2971 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2972 code)
    obfuscated_code, base, count, symbols = mobj.groups()
2974 base = int(base)
2975 count = int(count)
2976 symbols = symbols.split('|')
2977 symbol_table = {}
2978
2979 while count:
2980 count -= 1
2981 base_n_count = encode_base_n(count, base)
2982 symbol_table[base_n_count] = symbols[count] or base_n_count
2983
    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
2987
2988
2989 def parse_m3u8_attributes(attrib):
2990 info = {}
2991 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
2992 if val.startswith('"'):
2993 val = val[1:-1]
2994 info[key] = val
2995 return info
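
# Example (illustrative): quoted values may contain commas
#   parse_m3u8_attributes('BANDWIDTH=800000,CODECS="avc1.4d401e,mp4a.40.2"')
#   == {'BANDWIDTH': '800000', 'CODECS': 'avc1.4d401e,mp4a.40.2'}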
2996
2997
2998 def urshift(val, n):
2999 return val >> n if val >= 0 else (val + 0x100000000) >> n
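
# Example (illustrative): a 32-bit unsigned right shift
#   urshift(-4, 1) == 0x7FFFFFFE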
3000
3001
3002 # Based on png2str() written by @gdkchan and improved by @yokrysty
3003 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3004 def decode_png(png_data):
3005 # Reference: https://www.w3.org/TR/PNG/
3006 header = png_data[8:]
3007
3008 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3009 raise IOError('Not a valid PNG file.')
3010
3011 int_map = {1: '>B', 2: '>H', 4: '>I'}
    def unpack_integer(x):
        return compat_struct_unpack(int_map[len(x)], x)[0]
3013
3014 chunks = []
3015
3016 while header:
3017 length = unpack_integer(header[:4])
3018 header = header[4:]
3019
3020 chunk_type = header[:4]
3021 header = header[4:]
3022
3023 chunk_data = header[:length]
3024 header = header[length:]
3025
3026 header = header[4:] # Skip CRC
3027
3028 chunks.append({
3029 'type': chunk_type,
3030 'length': length,
3031 'data': chunk_data
3032 })
3033
3034 ihdr = chunks[0]['data']
3035
3036 width = unpack_integer(ihdr[:4])
3037 height = unpack_integer(ihdr[4:8])
3038
3039 idat = b''
3040
3041 for chunk in chunks:
3042 if chunk['type'] == b'IDAT':
3043 idat += chunk['data']
3044
3045 if not idat:
3046 raise IOError('Unable to read PNG data.')
3047
3048 decompressed_data = bytearray(zlib.decompress(idat))
3049
3050 stride = width * 3
3051 pixels = []
3052
3053 def _get_pixel(idx):
3054 x = idx % stride
3055 y = idx // stride
3056 return pixels[y][x]
3057
3058 for y in range(height):
3059 basePos = y * (1 + stride)
3060 filter_type = decompressed_data[basePos]
3061
3062 current_row = []
3063
3064 pixels.append(current_row)
3065
3066 for x in range(stride):
3067 color = decompressed_data[1 + basePos + x]
3068 basex = y * stride + x
3069 left = 0
3070 up = 0
3071
3072 if x > 2:
3073 left = _get_pixel(basex - 3)
3074 if y > 0:
3075 up = _get_pixel(basex - stride)
3076
3077 if filter_type == 1: # Sub
3078 color = (color + left) & 0xff
3079 elif filter_type == 2: # Up
3080 color = (color + up) & 0xff
3081 elif filter_type == 3: # Average
3082 color = (color + ((left + up) >> 1)) & 0xff
3083 elif filter_type == 4: # Paeth
3084 a = left
3085 b = up
3086 c = 0
3087
3088 if x > 2 and y > 0:
3089 c = _get_pixel(basex - stride - 3)
3090
3091 p = a + b - c
3092
3093 pa = abs(p - a)
3094 pb = abs(p - b)
3095 pc = abs(p - c)
3096
3097 if pa <= pb and pa <= pc:
3098 color = (color + a) & 0xff
3099 elif pb <= pc:
3100 color = (color + b) & 0xff
3101 else:
3102 color = (color + c) & 0xff
3103
3104 current_row.append(color)
3105
3106 return width, height, pixels