yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import collections
  11 import contextlib
  12 import ctypes
  13 import datetime
  14 import email.utils
  15 import email.header
  16 import errno
  17 import functools
  18 import gzip
  19 import hashlib
  20 import hmac
  21 import importlib.util
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import operator
  28 import os
  29 import platform
  30 import random
  31 import re
  32 import socket
  33 import ssl
  34 import subprocess
  35 import sys
  36 import tempfile
  37 import time
  38 import traceback
  39 import xml.etree.ElementTree
  40 import zlib
  41 import mimetypes
  42
  43 from .compat import (
  44     compat_HTMLParseError,
  45     compat_HTMLParser,
  46     compat_HTTPError,
  47     compat_basestring,
  48     compat_chr,
  49     compat_cookiejar,
  50     compat_ctypes_WINFUNCTYPE,
  51     compat_etree_fromstring,
  52     compat_expanduser,
  53     compat_html_entities,
  54     compat_html_entities_html5,
  55     compat_http_client,
  56     compat_integer_types,
  57     compat_numeric_types,
  58     compat_kwargs,
  59     compat_os_name,
  60     compat_parse_qs,
  61     compat_shlex_quote,
  62     compat_str,
  63     compat_struct_pack,
  64     compat_struct_unpack,
  65     compat_urllib_error,
  66     compat_urllib_parse,
  67     compat_urllib_parse_urlencode,
  68     compat_urllib_parse_urlparse,
  69     compat_urllib_parse_urlunparse,
  70     compat_urllib_parse_quote,
  71     compat_urllib_parse_quote_plus,
  72     compat_urllib_parse_unquote_plus,
  73     compat_urllib_request,
  74     compat_urlparse,
  75     compat_xpath,
  76 )
  77
  78 from .socks import (
  79     ProxyType,
  80     sockssocket,
  81 )
  82
  83
  84 def register_socks_protocols():
  85     # "Register" SOCKS protocols
  86     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  87     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  88     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  89         if scheme not in compat_urlparse.uses_netloc:
  90             compat_urlparse.uses_netloc.append(scheme)
  91
  92
# The type of compiled regular expression objects, obtained by compiling an
# empty pattern. This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
  95
  96
  97 def random_user_agent():
  98     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  99     _CHROME_VERSIONS = (
 100         '90.0.4430.212',
 101         '90.0.4430.24',
 102         '90.0.4430.70',
 103         '90.0.4430.72',
 104         '90.0.4430.85',
 105         '90.0.4430.93',
 106         '91.0.4472.101',
 107         '91.0.4472.106',
 108         '91.0.4472.114',
 109         '91.0.4472.124',
 110         '91.0.4472.164',
 111         '91.0.4472.19',
 112         '91.0.4472.77',
 113         '92.0.4515.107',
 114         '92.0.4515.115',
 115         '92.0.4515.131',
 116         '92.0.4515.159',
 117         '92.0.4515.43',
 118         '93.0.4556.0',
 119         '93.0.4577.15',
 120         '93.0.4577.63',
 121         '93.0.4577.82',
 122         '94.0.4606.41',
 123         '94.0.4606.54',
 124         '94.0.4606.61',
 125         '94.0.4606.71',
 126         '94.0.4606.81',
 127         '94.0.4606.85',
 128         '95.0.4638.17',
 129         '95.0.4638.50',
 130         '95.0.4638.54',
 131         '95.0.4638.69',
 132         '95.0.4638.74',
 133         '96.0.4664.18',
 134         '96.0.4664.45',
 135         '96.0.4664.55',
 136         '96.0.4664.93',
 137         '97.0.4692.20',
 138     )
 139     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 140
 141
# Default HTTP request headers. The User-Agent is chosen once, when this
# module is imported, by calling random_user_agent()
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Fixed User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Unique sentinel used as the default of `default=` parameters (e.g. in
# xpath_element) so that "no default given" can be told apart from None
NO_DEFAULT = object()
 156
# Full English month names, in calendar order (January first)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code; each list is in calendar order
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 167
# File extensions (without the leading dot) of media containers, audio formats
# and streaming manifests.
# NOTE(review): the consumers of this tuple are outside this part of the file;
# confirm how it is used before relying on membership semantics
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
 182
# needed for sanitizing filenames in restricted mode
# Maps each accented character of the first string to the ASCII replacement at
# the same position in the chained sequence. Multi-letter replacements ('AE',
# 'OE', 'TH', 'ss', 'ae', 'oe', 'th') are wrapped in lists so that each of them
# is paired with exactly one source character
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 187
# Date/time layouts written with strftime/strptime directives. None of these
# contains a purely numeric day/month pair whose order would be ambiguous;
# those are added by DATE_FORMATS_DAY_FIRST and DATE_FORMATS_MONTH_FIRST.
# NOTE(review): presumably tried in order by the date-parsing helpers further
# down the file - confirm before reordering entries
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)
 232
 233 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 234 DATE_FORMATS_DAY_FIRST.extend([
 235     '%d-%m-%Y',
 236     '%d.%m.%Y',
 237     '%d.%m.%y',
 238     '%d/%m/%Y',
 239     '%d/%m/%y',
 240     '%d/%m/%Y %H:%M:%S',
 241 ])
 242
 243 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 244 DATE_FORMATS_MONTH_FIRST.extend([
 245     '%m-%d-%Y',
 246     '%m.%d.%Y',
 247     '%m/%d/%Y',
 248     '%m/%d/%y',
 249     '%m/%d/%Y %H:%M:%S',
 250 ])
 251
# Matches a call tail of the form }('<payload>',<int>,<int>,'<words>'.split('|')
# and captures the payload, the two integers and the '|'-separated word list.
# NOTE(review): looks like the argument list of "packed" (p,a,c,k,e,d) JavaScript - confirm
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type attribute is application/ld+json
# (optionally quoted); the element's content is captured as group 'json_ld'
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 254
 255
 256 def preferredencoding():
 257     """Get preferred encoding.
 258
 259     Returns the best encoding scheme for the system, based on
 260     locale.getpreferredencoding() and some further tweaks.
 261     """
 262     try:
 263         pref = locale.getpreferredencoding()
 264         'TEST'.encode(pref)
 265     except Exception:
 266         pref = 'UTF-8'
 267
 268     return pref
 269
 270
 271 def write_json_file(obj, fn):
 272     """ Encode obj as JSON and write it to fn, atomically if possible """
 273
 274     fn = encodeFilename(fn)
 275     if sys.version_info < (3, 0) and sys.platform != 'win32':
 276         encoding = get_filesystem_encoding()
 277         # os.path.basename returns a bytes object, but NamedTemporaryFile
 278         # will fail if the filename contains non ascii characters unless we
 279         # use a unicode object
 280         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 281         # the same for os.path.dirname
 282         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 283     else:
 284         path_basename = os.path.basename
 285         path_dirname = os.path.dirname
 286
 287     args = {
 288         'suffix': '.tmp',
 289         'prefix': path_basename(fn) + '.',
 290         'dir': path_dirname(fn),
 291         'delete': False,
 292     }
 293
 294     # In Python 2.x, json.dump expects a bytestream.
 295     # In Python 3.x, it writes to a character stream
 296     if sys.version_info < (3, 0):
 297         args['mode'] = 'wb'
 298     else:
 299         args.update({
 300             'mode': 'w',
 301             'encoding': 'utf-8',
 302         })
 303
 304     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 305
 306     try:
 307         with tf:
 308             json.dump(obj, tf)
 309         if sys.platform == 'win32':
 310             # Need to remove existing file on Windows, else os.rename raises
 311             # WindowsError or FileExistsError.
 312             try:
 313                 os.unlink(fn)
 314             except OSError:
 315                 pass
 316         try:
 317             mask = os.umask(0)
 318             os.umask(mask)
 319             os.chmod(tf.name, 0o666 & ~mask)
 320         except OSError:
 321             pass
 322         os.rename(tf.name, fn)
 323     except Exception:
 324         try:
 325             os.remove(tf.name)
 326         except OSError:
 327             pass
 328         raise
 329
 330
 331 if sys.version_info >= (2, 7):
 332     def find_xpath_attr(node, xpath, key, val=None):
 333         """ Find the xpath xpath[@key=val] """
 334         assert re.match(r'^[a-zA-Z_-]+$', key)
 335         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 336         return node.find(expr)
 337 else:
 338     def find_xpath_attr(node, xpath, key, val=None):
 339         for f in node.findall(compat_xpath(xpath)):
 340             if key not in f.attrib:
 341                 continue
 342             if val is None or f.attrib.get(key) == val:
 343                 return f
 344         return None
 345
 346 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 347 # the namespace parameter
 348
 349
 350 def xpath_with_ns(path, ns_map):
 351     components = [c.split(':') for c in path.split('/')]
 352     replaced = []
 353     for c in components:
 354         if len(c) == 1:
 355             replaced.append(c[0])
 356         else:
 357             ns, tag = c
 358             replaced.append('{%s}%s' % (ns_map[ns], tag))
 359     return '/'.join(replaced)
 360
 361
 362 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 363     def _find_xpath(xpath):
 364         return node.find(compat_xpath(xpath))
 365
 366     if isinstance(xpath, (str, compat_str)):
 367         n = _find_xpath(xpath)
 368     else:
 369         for xp in xpath:
 370             n = _find_xpath(xp)
 371             if n is not None:
 372                 break
 373
 374     if n is None:
 375         if default is not NO_DEFAULT:
 376             return default
 377         elif fatal:
 378             name = xpath if name is None else name
 379             raise ExtractorError('Could not find XML element %s' % name)
 380         else:
 381             return None
 382     return n
 383
 384
 385 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 386     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 387     if n is None or n == default:
 388         return n
 389     if n.text is None:
 390         if default is not NO_DEFAULT:
 391             return default
 392         elif fatal:
 393             name = xpath if name is None else name
 394             raise ExtractorError('Could not find XML element\'s text %s' % name)
 395         else:
 396             return None
 397     return n.text
 398
 399
 400 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 401     n = find_xpath_attr(node, xpath, key)
 402     if n is None:
 403         if default is not NO_DEFAULT:
 404             return default
 405         elif fatal:
 406             name = '%s[@%s]' % (xpath, key) if name is None else name
 407             raise ExtractorError('Could not find XML attribute %s' % name)
 408         else:
 409             return None
 410     return n.attrib[key]
 411
 412
 413 def get_element_by_id(id, html):
 414     """Return the content of the tag with the specified ID in the passed HTML document"""
 415     return get_element_by_attribute('id', id, html)
 416
 417
 418 def get_element_by_class(class_name, html):
 419     """Return the content of the first tag with the specified class in the passed HTML document"""
 420     retval = get_elements_by_class(class_name, html)
 421     return retval[0] if retval else None
 422
 423
 424 def get_element_by_attribute(attribute, value, html, escape_value=True):
 425     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 426     return retval[0] if retval else None
 427
 428
 429 def get_elements_by_class(class_name, html):
 430     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 431     return get_elements_by_attribute(
 432         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 433         html, escape_value=False)
 434
 435
 436 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 437     """Return the content of the tag with the specified attribute in the passed HTML document"""
 438
 439     value = re.escape(value) if escape_value else value
 440
 441     retlist = []
 442     for m in re.finditer(r'''(?xs)
 443         <([a-zA-Z0-9:._-]+)
 444          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 445          \s+%s=['"]?%s['"]?
 446          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 447         \s*>
 448         (?P<content>.*?)
 449         </\1>
 450     ''' % (re.escape(attribute), value), html):
 451         res = m.group('content')
 452
 453         if res.startswith('"') or res.startswith("'"):
 454             res = res[1:-1]
 455
 456         retlist.append(unescapeHTML(res))
 457
 458     return retlist
 459
 460
 461 class HTMLAttributeParser(compat_HTMLParser):
 462     """Trivial HTML parser to gather the attributes for a single element"""
 463
 464     def __init__(self):
 465         self.attrs = {}
 466         compat_HTMLParser.__init__(self)
 467
 468     def handle_starttag(self, tag, attrs):
 469         self.attrs = dict(attrs)
 470
 471
 472 class HTMLListAttrsParser(compat_HTMLParser):
 473     """HTML parser to gather the attributes for the elements of a list"""
 474
 475     def __init__(self):
 476         compat_HTMLParser.__init__(self)
 477         self.items = []
 478         self._level = 0
 479
 480     def handle_starttag(self, tag, attrs):
 481         if tag == 'li' and self._level == 0:
 482             self.items.append(dict(attrs))
 483         self._level += 1
 484
 485     def handle_endtag(self, tag):
 486         self._level -= 1
 487
 488
 489 def extract_attributes(html_element):
 490     """Given a string for an HTML element such as
 491     <el
 492          a="foo" B="bar" c="&98;az" d=boz
 493          empty= noval entity="&amp;"
 494          sq='"' dq="'"
 495     >
 496     Decode and return a dictionary of attributes.
 497     {
 498         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 499         'empty': '', 'noval': None, 'entity': '&',
 500         'sq': '"', 'dq': '\''
 501     }.
 502     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 503     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 504     """
 505     parser = HTMLAttributeParser()
 506     try:
 507         parser.feed(html_element)
 508         parser.close()
 509     # Older Python may throw HTMLParseError in case of malformed HTML
 510     except compat_HTMLParseError:
 511         pass
 512     return parser.attrs
 513
 514
 515 def parse_list(webpage):
 516     """Given a string for an series of HTML <li> elements,
 517     return a dictionary of their attributes"""
 518     parser = HTMLListAttrsParser()
 519     parser.feed(webpage)
 520     parser.close()
 521     return parser.items
 522
 523
 524 def clean_html(html):
 525     """Clean an HTML snippet into a readable string"""
 526
 527     if html is None:  # Convenience for sanitizing descriptions etc.
 528         return html
 529
 530     # Newline vs <br />
 531     html = html.replace('\n', ' ')
 532     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 533     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 534     # Strip html tags
 535     html = re.sub('<.*?>', '', html)
 536     # Replace html entities
 537     html = unescapeHTML(html)
 538     return html.strip()
 539
 540
 541 def sanitize_open(filename, open_mode):
 542     """Try to open the given filename, and slightly tweak it if this fails.
 543
 544     Attempts to open the given filename. If this fails, it tries to change
 545     the filename slightly, step by step, until it's either able to open it
 546     or it fails and raises a final exception, like the standard open()
 547     function.
 548
 549     It returns the tuple (stream, definitive_file_name).
 550     """
 551     try:
 552         if filename == '-':
 553             if sys.platform == 'win32':
 554                 import msvcrt
 555                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 556             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 557         stream = open(encodeFilename(filename), open_mode)
 558         return (stream, filename)
 559     except (IOError, OSError) as err:
 560         if err.errno in (errno.EACCES,):
 561             raise
 562
 563         # In case of error, try to remove win32 forbidden chars
 564         alt_filename = sanitize_path(filename)
 565         if alt_filename == filename:
 566             raise
 567         else:
 568             # An exception here should be caught in the caller
 569             stream = open(encodeFilename(alt_filename), open_mode)
 570             return (stream, alt_filename)
 571
 572
 573 def timeconvert(timestr):
 574     """Convert RFC 2822 defined time string into system timestamp"""
 575     timestamp = None
 576     timetuple = email.utils.parsedate_tz(timestr)
 577     if timetuple is not None:
 578         timestamp = email.utils.mktime_tz(timetuple)
 579     return timestamp
 580
 581
 582 def sanitize_filename(s, restricted=False, is_id=False):
 583     """Sanitizes a string so it could be used as part of a filename.
 584     If restricted is set, use a stricter subset of allowed characters.
 585     Set is_id if this is not an arbitrary string, but an ID that should be kept
 586     if possible.
 587     """
 588     def replace_insane(char):
 589         if restricted and char in ACCENT_CHARS:
 590             return ACCENT_CHARS[char]
 591         elif not restricted and char == '\n':
 592             return ' '
 593         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 594             return ''
 595         elif char == '"':
 596             return '' if restricted else '\''
 597         elif char == ':':
 598             return '_-' if restricted else ' -'
 599         elif char in '\\/|*<>':
 600             return '_'
 601         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 602             return '_'
 603         if restricted and ord(char) > 127:
 604             return '_'
 605         return char
 606
 607     if s == '':
 608         return ''
 609     # Handle timestamps
 610     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 611     result = ''.join(map(replace_insane, s))
 612     if not is_id:
 613         while '__' in result:
 614             result = result.replace('__', '_')
 615         result = result.strip('_')
 616         # Common case of "Foreign band name - English song title"
 617         if restricted and result.startswith('-_'):
 618             result = result[2:]
 619         if result.startswith('-'):
 620             result = '_' + result[len('-'):]
 621         result = result.lstrip('.')
 622         if not result:
 623             result = '_'
 624     return result
 625
 626
 627 def sanitize_path(s, force=False):
 628     """Sanitizes and normalizes path on Windows"""
 629     if sys.platform == 'win32':
 630         force = False
 631         drive_or_unc, _ = os.path.splitdrive(s)
 632         if sys.version_info < (2, 7) and not drive_or_unc:
 633             drive_or_unc, _ = os.path.splitunc(s)
 634     elif force:
 635         drive_or_unc = ''
 636     else:
 637         return s
 638
 639     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 640     if drive_or_unc:
 641         norm_path.pop(0)
 642     sanitized_path = [
 643         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 644         for path_part in norm_path]
 645     if drive_or_unc:
 646         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 647     elif force and s[0] == os.path.sep:
 648         sanitized_path.insert(0, os.path.sep)
 649     return os.path.join(*sanitized_path)
 650
 651
 652 def sanitize_url(url):
 653     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 654     # the number of unwanted failures due to missing protocol
 655     if url.startswith('//'):
 656         return 'http:%s' % url
 657     # Fix some common typos seen so far
 658     COMMON_TYPOS = (
 659         # https://github.com/ytdl-org/youtube-dl/issues/15649
 660         (r'^httpss://', r'https://'),
 661         # https://bx1.be/lives/direct-tv/
 662         (r'^rmtp([es]?)://', r'rtmp\1://'),
 663     )
 664     for mistake, fixup in COMMON_TYPOS:
 665         if re.match(mistake, url):
 666             return re.sub(mistake, fixup, url)
 667     return url
 668
 669
 670 def extract_basic_auth(url):
 671     parts = compat_urlparse.urlsplit(url)
 672     if parts.username is None:
 673         return url, None
 674     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 675         parts.hostname if parts.port is None
 676         else '%s:%d' % (parts.hostname, parts.port))))
 677     auth_payload = base64.b64encode(
 678         ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
 679     return url, 'Basic ' + auth_payload.decode('utf-8')
 680
 681
 682 def sanitized_Request(url, *args, **kwargs):
 683     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 684     if auth_header is not None:
 685         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 686         headers['Authorization'] = auth_header
 687     return compat_urllib_request.Request(url, *args, **kwargs)
 688
 689
 690 def expand_path(s):
 691     """Expand shell variables and ~"""
 692     return os.path.expandvars(compat_expanduser(s))
 693
 694
 695 def orderedSet(iterable):
 696     """ Remove all duplicates from the input iterable """
 697     res = []
 698     for el in iterable:
 699         if el not in res:
 700             res.append(el)
 701     return res
 702
 703
 704 def _htmlentity_transform(entity_with_semicolon):
 705     """Transforms an HTML entity to a character."""
 706     entity = entity_with_semicolon[:-1]
 707
 708     # Known non-numeric HTML entity
 709     if entity in compat_html_entities.name2codepoint:
 710         return compat_chr(compat_html_entities.name2codepoint[entity])
 711
 712     # TODO: HTML5 allows entities without a semicolon. For example,
 713     # '&Eacuteric' should be decoded as 'Éric'.
 714     if entity_with_semicolon in compat_html_entities_html5:
 715         return compat_html_entities_html5[entity_with_semicolon]
 716
 717     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 718     if mobj is not None:
 719         numstr = mobj.group(1)
 720         if numstr.startswith('x'):
 721             base = 16
 722             numstr = '0%s' % numstr
 723         else:
 724             base = 10
 725         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 726         try:
 727             return compat_chr(int(numstr, base))
 728         except ValueError:
 729             pass
 730
 731     # Unknown entity in name, return its literal representation
 732     return '&%s;' % entity
 733
 734
 735 def unescapeHTML(s):
 736     if s is None:
 737         return None
 738     assert type(s) == compat_str
 739
 740     return re.sub(
 741         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 742
 743
 744 def escapeHTML(text):
 745     return (
 746         text
 747         .replace('&', '&amp;')
 748         .replace('<', '&lt;')
 749         .replace('>', '&gt;')
 750         .replace('"', '&quot;')
 751         .replace("'", '&#39;')
 752     )
 753
 754
 755 def process_communicate_or_kill(p, *args, **kwargs):
 756     try:
 757         return p.communicate(*args, **kwargs)
 758     except BaseException:  # Including KeyboardInterrupt
 759         p.kill()
 760         p.wait()
 761         raise
 762
 763
 764 class Popen(subprocess.Popen):
 765     if sys.platform == 'win32':
 766         _startupinfo = subprocess.STARTUPINFO()
 767         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 768     else:
 769         _startupinfo = None
 770
 771     def __init__(self, *args, **kwargs):
 772         super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
 773
 774     def communicate_or_kill(self, *args, **kwargs):
 775         return process_communicate_or_kill(self, *args, **kwargs)
 776
 777
 778 def get_subprocess_encoding():
 779     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 780         # For subprocess calls, encode with locale encoding
 781         # Refer to http://stackoverflow.com/a/9951851/35070
 782         encoding = preferredencoding()
 783     else:
 784         encoding = sys.getfilesystemencoding()
 785     if encoding is None:
 786         encoding = 'utf-8'
 787     return encoding
 788
 789
 790 def encodeFilename(s, for_subprocess=False):
 791     """
 792     @param s The name of the file
 793     """
 794
 795     assert type(s) == compat_str
 796
 797     # Python 3 has a Unicode API
 798     if sys.version_info >= (3, 0):
 799         return s
 800
 801     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 802     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 803     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 804     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 805         return s
 806
 807     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 808     if sys.platform.startswith('java'):
 809         return s
 810
 811     return s.encode(get_subprocess_encoding(), 'ignore')
 812
 813
 814 def decodeFilename(b, for_subprocess=False):
 815
 816     if sys.version_info >= (3, 0):
 817         return b
 818
 819     if not isinstance(b, bytes):
 820         return b
 821
 822     return b.decode(get_subprocess_encoding(), 'ignore')
 823
 824
 825 def encodeArgument(s):
 826     if not isinstance(s, compat_str):
 827         # Legacy code that uses byte strings
 828         # Uncomment the following line after fixing all post processors
 829         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 830         s = s.decode('ascii')
 831     return encodeFilename(s, True)
 832
 833
 834 def decodeArgument(b):
 835     return decodeFilename(b, True)
 836
 837
 838 def decodeOption(optval):
 839     if optval is None:
 840         return optval
 841     if isinstance(optval, bytes):
 842         optval = optval.decode(preferredencoding())
 843
 844     assert isinstance(optval, compat_str)
 845     return optval
 846
 847
 848 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 849
 850
 851 def timetuple_from_msec(msec):
 852     secs, msec = divmod(msec, 1000)
 853     mins, secs = divmod(secs, 60)
 854     hrs, mins = divmod(mins, 60)
 855     return _timetuple(hrs, mins, secs, msec)
 856
 857
 858 def formatSeconds(secs, delim=':', msec=False):
 859     time = timetuple_from_msec(secs * 1000)
 860     if time.hours:
 861         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 862     elif time.minutes:
 863         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 864     else:
 865         ret = '%d' % time.seconds
 866     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 867
 868
 869 def _ssl_load_windows_store_certs(ssl_context, storename):
 870     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 871     try:
 872         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 873                  if encoding == 'x509_asn' and (
 874                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 875     except PermissionError:
 876         return
 877     for cert in certs:
 878         try:
 879             ssl_context.load_verify_locations(cadata=cert)
 880         except ssl.SSLError:
 881             pass
 882
 883
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler backed by a client-side TLS context.

    @param params  Mapping of options. Only 'nocheckcertificate' is read here:
                   a truthy value disables hostname checking and certificate
                   verification. The mapping is also passed on to the handler.
    @param kwargs  Passed through unchanged to YoutubeDLHTTPSHandler
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # check_hostname is assigned before verify_mode: the ssl module refuses
    # CERT_NONE while hostname checking is still enabled
    context.check_hostname = opts_check_certificate
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                # Load the Windows stores one certificate at a time, so that
                # bad certificates are skipped individually
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            # Runs for every platform once load_default_certs() has failed
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 905
 906
 907 def bug_reports_message(before=';'):
 908     if ytdl_is_updateable():
 909         update_cmd = 'type  yt-dlp -U  to update'
 910     else:
 911         update_cmd = 'see  https://github.com/yt-dlp/yt-dlp  on how to update'
 912     msg = 'please report this issue on  https://github.com/yt-dlp/yt-dlp .'
 913     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 914     msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
 915
 916     before = before.rstrip()
 917     if not before or before.endswith(('.', '!', '?')):
 918         msg = msg[0].title() + msg[1:]
 919
 920     return (before + ' ' if before else '') + msg
 921
 922
 923 class YoutubeDLError(Exception):
 924     """Base exception for YoutubeDL errors."""
 925     msg = None
 926
 927     def __init__(self, msg=None):
 928         if msg is not None:
 929             self.msg = msg
 930         elif self.msg is None:
 931             self.msg = type(self).__name__
 932         super().__init__(self.msg)
 933
 934
 935 network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
 936 if hasattr(ssl, 'CertificateError'):
 937     network_exceptions.append(ssl.CertificateError)
 938 network_exceptions = tuple(network_exceptions)
 939
 940
 941 class ExtractorError(YoutubeDLError):
 942     """Error during info extraction."""
 943
 944     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
 945         """ tb, if given, is the original traceback (so that it can be printed out).
 946         If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
 947         """
 948         if sys.exc_info()[0] in network_exceptions:
 949             expected = True
 950
 951         self.msg = str(msg)
 952         self.traceback = tb
 953         self.expected = expected
 954         self.cause = cause
 955         self.video_id = video_id
 956         self.ie = ie
 957         self.exc_info = sys.exc_info()  # preserve original exception
 958
 959         super(ExtractorError, self).__init__(''.join((
 960             format_field(ie, template='[%s] '),
 961             format_field(video_id, template='%s: '),
 962             self.msg,
 963             format_field(cause, template=' (caused by %r)'),
 964             '' if expected else bug_reports_message())))
 965
 966     def format_traceback(self):
 967         if self.traceback is None:
 968             return None
 969         return ''.join(traceback.format_tb(self.traceback))
 970
 971
 972 class UnsupportedError(ExtractorError):
 973     def __init__(self, url):
 974         super(UnsupportedError, self).__init__(
 975             'Unsupported URL: %s' % url, expected=True)
 976         self.url = url
 977
 978
 979 class RegexNotFoundError(ExtractorError):
 980     """Error when a regex didn't match"""
 981     pass
 982
 983
 984 class GeoRestrictedError(ExtractorError):
 985     """Geographic restriction Error exception.
 986
 987     This exception may be thrown when a video is not available from your
 988     geographic location due to geographic restrictions imposed by a website.
 989     """
 990
 991     def __init__(self, msg, countries=None, **kwargs):
 992         kwargs['expected'] = True
 993         super(GeoRestrictedError, self).__init__(msg, **kwargs)
 994         self.countries = countries
 995
 996
 997 class DownloadError(YoutubeDLError):
 998     """Download Error exception.
 999
1000     This exception may be thrown by FileDownloader objects if they are not
1001     configured to continue on errors. They will contain the appropriate
1002     error message.
1003     """
1004
1005     def __init__(self, msg, exc_info=None):
1006         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1007         super(DownloadError, self).__init__(msg)
1008         self.exc_info = exc_info
1009
1010
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist error.

    Thrown by YoutubeDL when a requested entry cannot be found in the
    playlist info_dict.
    """
    msg = 'Entry not found in info'
1018
1019
class SameFileError(YoutubeDLError):
    """Same file error.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk. When a filename is
    given it is appended to the default message.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        text = self.msg
        if filename is not None:
            text = text + f': {filename}'
        super().__init__(text)
1032
1033
class PostProcessingError(YoutubeDLError):
    """Post processing error.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """
1040
1041
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted."""
    msg = 'The download was cancelled'
1045
1046
class ExistingVideoReached(DownloadCancelled):
    """Cancellation triggered by --break-on-existing."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1050
1051
class RejectedVideoReached(DownloadCancelled):
    """Cancellation triggered by --break-on-reject."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1055
1056
class MaxDownloadsReached(DownloadCancelled):
    """Cancellation raised once the --max-downloads limit has been reached."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1060
1061
class ReExtractInfo(YoutubeDLError):
    """Signals that the video info needs to be re-extracted.

    `expected` is stored on the instance as given.
    """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected
1068
1069
class ThrottledDownload(ReExtractInfo):
    """Raised when the download speed is below --throttled-rate.

    Always constructed with the class-level message and expected=False.
    """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)
1076
1077
class UnavailableVideoError(YoutubeDLError):
    """Unavailable format error.

    Thrown when a video is requested in a format that is not available for
    that video. When `err` is given it is appended to the default message.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        text = self.msg
        if err is not None:
            text = text + f': {err}'
        super().__init__(text)
1090
1091
class ContentTooShortError(YoutubeDLError):
    """Content too short error.

    FileDownloader objects may raise this exception when a downloaded file is
    smaller than what the server announced first, indicating the connection
    was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        super().__init__(message)
        # Both sizes are in bytes
        self.downloaded, self.expected = downloaded, expected
1107
1108
class XAttrMetadataError(YoutubeDLError):
    """Error with an errno-style `code`, a `msg` and a derived `reason`.

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED',
    chosen from the error code and from well-known phrases in the message.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        # Re-assigned after the base initializer so that self.msg is exactly
        # the value that was passed in
        self.msg = msg

        # Classify the failure from the code and the message text
        out_of_space = (
            self.code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in self.msg
            or 'Disk quota exceeded' in self.msg)
        if out_of_space:
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1123
1124
class XAttrUnavailableError(YoutubeDLError):
    """YoutubeDLError subclass adding no behaviour of its own."""
1127
1128
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and, if requested, pin its outgoing source address.

    The connection object is built as http_class(*args, **kwargs). When
    ydl_handler._params contains a non-None 'source_address', the connection
    is patched so that sockets are bound to that address and only remote
    addresses of the matching IP family are tried.

    is_https is only consulted by the fallback connect() used when the
    connection object has no 'source_address' attribute.

    Returns the (possibly patched) connection object.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            # NOTE: this parameter shadows the outer source_address string; it is
            # indexed and passed to bind(), so it is expected to be an
            # (ip, port) tuple such as `sa` below
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A dot in the source IP is taken to mean IPv4, anything else IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                # The host resolved, but to no address of the required family
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate in turn; the first successful connect wins
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    # Remember the failure and release the socket before trying the next address
                    err = _
                    if sock is not None:
                        sock.close()
            # All candidates failed: re-raise the last error, or report that
            # there was nothing to try
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        # Port 0 in the bind address
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # Fallback: replace connect() with one that builds the socket via
            # _create_connection and wraps it for HTTPS when is_https is set
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
1192
1193
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker to a header mapping.

    Without the marker the given mapping is returned as is. With it, a new
    dict is returned that lacks both the marker and every 'Accept-Encoding'
    header (matched case-insensitively); the input mapping is not modified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    return {
        name: value for name, value in headers.items()
        if name != marker and name.lower() != 'accept-encoding'
    }
1202
1203
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    The same request/response hooks are also installed for HTTPS (see the
    https_request / https_response aliases at the end of the class).

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Initialize the base handler and keep `params` as self._params."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open `req`, going through a SOCKS proxy if 'Ytdl-socks-proxy' is set.

        The internal 'Ytdl-socks-proxy' header is removed from the request
        once it has been used to pick the connection class.
        """
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, returning empty input unchanged.

        Decompression is first attempted with a negative window size and,
        if zlib rejects that, retried with zlib's defaults.
        """
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        """Prepare an outgoing request.

        Percent-encodes the URL if needed, adds any missing standard headers
        and applies handle_youtubedl_headers(). Returns the request to send,
        which is a new object when the URL had to be escaped.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response.

        Bodies with a 'gzip' or 'deflate' Content-encoding are decompressed
        and re-wrapped (the Content-encoding header is then removed). For 3xx
        responses the Location header is replaced by its percent-encoded form
        when that differs. Returns the response to hand on.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; if no attempt
                # succeeds, the error from the first attempt is re-raised
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic goes through the same request/response processing
    https_request = http_request
    https_response = http_response
1327
1328
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of base_class that connects through a SOCKS proxy.

    base_class must be HTTPConnection, HTTPSConnection or a subclass of one
    of them. socks_proxy is the proxy URL; its scheme selects the protocol
    ('socks5', 'socks4a', and 'socks' or 'socks4' for SOCKS4), the port
    defaults to 1080, and username/password, if present, are URL-unquoted.
    DNS resolution is delegated to the proxy.

    Returns the new connection class.

    Raises ValueError if the scheme of socks_proxy is not one of the
    supported SOCKS schemes.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear message instead of leaving socks_type unbound,
        # which would surface below as an UnboundLocalError
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        # Leave None and the empty string untouched
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            """Connect via the SOCKS proxy, wrapping the socket in TLS for HTTPS."""
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only plain numeric timeouts are applied to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1370
1371
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with optional SOCKS proxying and a custom connection class."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Initialize the base handler; `https_conn_class` defaults to HTTPSConnection."""
        super().__init__(*args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open `req`, going through a SOCKS proxy if 'Ytdl-socks-proxy' is set.

        The internal 'Ytdl-socks-proxy' header is removed from the request
        once it has been used to pick the connection class.
        """
        conn_class = self._https_conn_class

        # Forward the SSL settings, but only those this handler instance has
        # ('_context': python > 2.6, '_check_hostname': python 3.x)
        forwarded = (('context', '_context'), ('check_hostname', '_check_hostname'))
        kwargs = {
            option: getattr(self, attribute)
            for option, attribute in forwarded
            if hasattr(self, attribute)
        }

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **kwargs)
1395
1396
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    MozillaCookieJar variant with its own save() and load().

    Cookie files are read and written as UTF-8, lines prefixed with
    '#HttpOnly_' are accepted, malformed entries are skipped with a warning,
    and session cookies are stored with `expires` set to 0.

    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    # Prefix stripped from lines before they are parsed
    _HTTPONLY_PREFIX = '#HttpOnly_'
    # Number of tab-separated fields in a valid cookie line
    _ENTRY_LEN = 7
    # Written at the top of every saved cookie file
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    # Field layout of one cookie line, used for validation in load()
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set. Cookies marked as discard are skipped unless ignore_discard
        is set, and expired cookies are skipped unless ignore_expires is set.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        # NOTE: this modifies the cookie objects held by the jar, not just
        # the text that is written out
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                # One tab-separated line per cookie
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set. Invalid entries are reported on stderr and skipped. Cookies
        whose `expires` is 0 are turned into session cookies after loading.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            """Return `line` ready for parsing, or raise LoadError if it is invalid."""
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            # An empty expiry is allowed; anything else must be all digits
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        # Collect the accepted lines in memory and parse that buffer instead
        # of the file itself
        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1512                 cookie.discard = True
1513
1514
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor whose HTTP hooks are also used for HTTPS."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response unchanged."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # The workaround below is kept for reference only; it is disabled:
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return super().http_response(request, response)

    # Apply the same cookie handling to HTTPS requests and responses
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1537
1538
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """Redirect handler used by YoutubeDL.

    Derived from CPython's HTTPRedirectHandler implementation [1].

    It addresses two issues:
     - the redirect URL is always unicode under python 2
     - the experimental HTTP response status code 308 Permanent Redirect [2],
       which some sites use [3], is supported

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # All of these status codes share the stock 302 handling
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return the Request to follow for a redirect, or raise HTTPError.

        The http_error_30x methods call this when a redirection response is
        received. GET and HEAD are redirected for 301, 302, 303, 307 and 308;
        POST only for 301, 302 and 303. Any other combination raises
        HTTPError so that no other handler tries to handle this url.

        The returned Request targets newurl and carries the original headers
        except Content-Length and Content-Type.
        """
        method = req.get_method()
        redirectable = (
            (code in (301, 302, 303, 307, 308) and method in ("GET", "HEAD"))
            or (code in (301, 302, 303) and method == "POST"))
        if not redirectable:
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # Headers describing the request body are not carried over
        newheaders = {
            name: value for name, value in req.headers.items()
            if name.lower() not in CONTENT_HEADERS
        }
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)
1594
1595
def extract_timezone(date_str):
    """Split a timezone designator off a date string.

    Returns a tuple (timezone, date_str) where timezone is the UTC offset as a
    datetime.timedelta and date_str is the input with the matched designator
    cut off its end. Without a recognized designator the offset is zero and
    the string is returned unchanged; a 'Z' designator also yields zero.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if m is None:
        return datetime.timedelta(), date_str

    # Cut the designator (including its optional leading space) off the end
    date_str = date_str[:-len(m.group('tz'))]
    sign = m.group('sign')
    if not sign:
        # Plain 'Z': UTC
        return datetime.timedelta(), date_str

    factor = 1 if sign == '+' else -1
    offset = datetime.timedelta(
        hours=factor * int(m.group('hours')),
        minutes=factor * int(m.group('minutes')))
    return offset, date_str
1620
1621
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp from the given date, or None.

    date_str must look like 'YYYY-MM-DD<delimiter>HH:MM:SS'; fractional parts
    ('.' followed by digits) are dropped before parsing. timezone is the UTC
    offset as a datetime.timedelta; when it is None the offset is extracted
    from date_str itself. None is returned for a None input or a string that
    does not match the expected format.
    """
    if date_str is None:
        return None

    # Drop fractional parts such as '.123'
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        # Subtracting the offset converts the local time to UTC
        utc_moment = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(utc_moment.timetuple())
    except ValueError:
        return None
1639
1640
def date_formats(day_first=True):
    """Return the strptime expressions to try: the day-first set, or the month-first set when day_first is false"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1643
1644
def unified_strdate(date_str, day_first=True):
    """
    Return a string with the date in the format YYYYMMDD

    Every expression from date_formats(day_first) is tried; if none parses
    the string, email.utils.parsedate_tz is used as a fallback.
    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Commas become spaces, then AM/PM markers (with a following alphabetic
    # timezone word) and any UTC offset are removed before parsing
    cleaned = date_str.replace(',', ' ')
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    upload_date = None
    # There is deliberately no early exit: if several expressions parse the
    # string, the last matching one determines the result
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(cleaned, expression).strftime('%Y%m%d')

    if upload_date is None:
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')

    return None if upload_date is None else compat_str(upload_date)
1671
1672
def unified_timestamp(date_str, day_first=True):
    """
    Parse a free-form date/time string into a UNIX timestamp.

    The string is normalised (commas and pipes dropped, UTC offset extracted,
    AM/PM markers and unrecognised trailing timezone names removed, sub-
    microsecond digits truncated) and then matched against every expression
    from date_formats(day_first); email.utils.parsedate_tz is the fallback.
    The extracted UTC offset and a 12 hour shift for "PM" are applied on
    both paths.

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # A "PM" marker anywhere in the string shifts the parsed time by 12 hours
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The UTC offset was already cut out of date_str by extract_timezone,
        # and calendar.timegm only looks at the date/time fields, so the
        # offset has to be applied explicitly, as in the strptime path above
        return calendar.timegm(timetuple) + pm_delta * 3600 - int(timezone.total_seconds())
    return None
1704
1705
def determine_ext(url, default_ext='unknown_video'):
    """
    Guess a file extension from a URL.

    The text after the last "." of the part before any "?" is returned when
    it is purely alphanumeric, or (with trailing slashes removed) when it is
    listed in KNOWN_EXTENSIONS. Otherwise, or when url is None or has no ".",
    default_ext is returned.
    """
    if url is None or '.' not in url:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1717
1718
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return filename with its extension replaced by "<sub_lang>.<sub_format>" (see replace_extension)"""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1721
1722
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: strptime format used to parse an absolute date
    precision: round the time portion of a datetime object.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit provided in date_str (if applicable).
    """
    round_to_unit = precision == 'auto'
    if round_to_unit:
        precision = 'microsecond'

    now = datetime_round(datetime.datetime.now(), precision)
    if date_str in ('now', 'today'):
        return now
    if date_str == 'yesterday':
        return now - datetime.timedelta(days=1)

    relative = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if relative is None:
        # Not a relative expression: parse as an absolute date
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the sign is itself resolved recursively
    base = datetime_from_str(relative.group('start'), precision, format)
    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')

    if unit in ('month', 'year'):
        # Calendar-aware arithmetic; the result is then rounded like a day offset
        shifted = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(shifted, unit) if round_to_unit else shifted
1763
1764
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: strptime format used to parse an absolute date
    """
    resolved = datetime_from_str(date_str, precision='microsecond', format=format)
    return resolved.date()
1773
1774
def datetime_add_months(dt, months):
    """
    Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month
    (e.g. January 31 plus one month gives the last day of February).
    """
    year_offset, month_index = divmod(dt.month + months - 1, 12)
    target_year = dt.year + year_offset
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(target_year, target_month, min(dt.day, last_day))
1782
1783
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    precision is one of microsecond|second|minute|hour|day. With
    'microsecond' dt is returned untouched; otherwise the value is rounded to
    the nearest multiple of the unit and returned as a new naive datetime.
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {'day': 86400, 'hour': 3600, 'minute': 60, 'second': 1}
    epoch_seconds = calendar.timegm(dt.timetuple())
    step = seconds_per_unit[precision]
    # Adding half a step before flooring rounds to the nearest multiple
    rounded = ((epoch_seconds + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1800
1801
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1810
1811
class DateRange(object):
    """Represents a time interval between two dates (both ends included)"""

    def __init__(self, start=None, end=None):
        """
        start and end must be strings in the format accepted by date_from_str

        A missing start or end leaves that side of the range open (the
        earliest/latest date supported by datetime is used).
        Raises ValueError if start lies after end.
        """
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """
        Check if the date is in the range

        date may be a datetime.date, a datetime.datetime (only its date part
        is considered) or a string accepted by date_from_str.
        """
        if isinstance(date, datetime.datetime):
            # datetime is a subclass of date, but Python refuses to order a
            # datetime against the plain dates stored in start/end
            date = date.date()
        elif not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1841
1842
def platform_name():
    """ Returns platform.platform() as a compat_str, decoding it first if it is a byte string """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
1851
1852
def get_windows_version():
    ''' Get Windows version as returned by version_tuple. None if it's not running on Windows '''
    if compat_os_name != 'nt':
        return None
    return version_tuple(platform.win32_ver()[1])
1859
1860
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The "special method" is the WriteConsoleW function of kernel32, used only
    when out is file descriptor 1 or 2 and the matching standard handle is a
    real console. Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Each kernel32 function is bound with an explicit prototype:
    # (return type, *argument types)
    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for an invalid/missing handle, a handle whose file type
        # (ignoring the "remote" bit) is not a character device, or one for
        # which GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that stop before the next
    # non-BMP character; such a character is then written on its own
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written as 2 units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part of the chunk was actually written
            s = s[written.value:]
    return True
1933
1934
def write_string(s, out=None, encoding=None):
    """
    Write the text s to the stream out (sys.stderr by default) and flush it.

    On Windows, when no encoding is given, the console API is tried first
    via _windows_write_string. Otherwise s is encoded (dropping characters
    the encoding cannot represent) for binary streams and for streams that
    expose a .buffer, and written as text to anything else.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    console_candidate = sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno')
    if console_candidate and _windows_write_string(s, stream):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        target_encoding = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1955
1956
def bytes_to_intlist(bs):
    """Return the items of a byte sequence as a list of integers ([] for empty input)"""
    if not bs:
        return []
    # Python 3 bytes already index to ints; anything else is mapped through ord()
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
1964
1965
def intlist_to_bytes(xs):
    """Pack a sequence of integers into a byte string, one unsigned byte ('B') per item"""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
1970
1971
# Cross-platform file locking
# Exactly one pair of _lock_file(f, exclusive) / _unlock_file(f) gets defined:
# LockFileEx/UnlockFileEx on Windows, fcntl.flock where fcntl is available,
# and stubs that always raise IOError everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ctypes layout of the structure passed as the last argument of LockFileEx/UnlockFileEx"""
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f starting at offset 0; raises OSError if LockFileEx fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 is LOCKFILE_EXCLUSIVE_LOCK, 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file on f; raises OSError if UnlockFileEx fails"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive (LOCK_EX) or shared (LOCK_SH) flock on f"""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on f"""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Always raises IOError: there is no locking backend on this platform"""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Always raises IOError: there is no locking backend on this platform"""
            raise IOError(UNSUPPORTED_MSG)
2045
2046
class locked_file(object):
    """
    Context manager around a file that is locked while the block runs.

    The file is opened in __init__; the lock is taken on entering the
    "with" block (shared for mode 'r', exclusive for 'a' and 'w') and
    released, together with the file, on leaving it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        needs_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, needs_exclusive)
        except IOError:
            # Do not leave the file open if the lock could not be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        """Iterate over the underlying file"""
        return iter(self.f)

    def write(self, *args):
        """Forward to the underlying file's write()"""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the underlying file's read()"""
        return self.f.read(*args)
2076
2077
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' if it reports None"""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2081
2082
def shell_quote(args):
    """
    Return the arguments as a single space-separated, shell-quoted string.

    Byte-string arguments are decoded with the filesystem encoding first
    (we may get a filename encoded with 'encodeFilename').
    """
    encoding = get_filesystem_encoding()
    return ' '.join(
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
2092
2093
def smuggle_url(url, data):
    """
    Pass additional data in a URL for internal use.

    data (a dict) is JSON-encoded into a "#__youtubedl_smuggle=..." fragment
    appended to url. If url already carries smuggled data, it is merged in
    and its values win over those in data. The dict passed as data is left
    unmodified.
    """
    url, idata = unsmuggle_url(url, {})
    # Merge into a fresh dict so that the caller's dict is not changed as a
    # side effect; key order and precedence match data.update(idata)
    payload = dict(data)
    payload.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(payload)})
    return url + '#' + sdata
2102
2103
def unsmuggle_url(smug_url, default=None):
    """
    Split a URL produced by smuggle_url into (url, data).

    URLs without a "#__youtubedl_smuggle" marker are returned unchanged
    together with default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return plain_url, json.loads(encoded)
2111
2112
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """
    Formats numbers with decimal suffixes like K, M, etc

    num is divided by the largest power of factor that does not exceed it
    (limited to the available suffixes K, M, G, T, P, E, Z, Y) and fmt is
    applied to the (scaled number, suffix) pair.
    Returns None if num is None, cannot be converted to a float, or is
    negative.
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        # The logarithm used below is undefined for negative numbers
        return None

    suffixes = ['', *'KMGTPEZY']
    max_exponent = len(suffixes) - 1
    exponent = 0
    if num > 0:
        # Clamp to the suffix list: values below 1 get no suffix instead of
        # indexing the list from its end, and values beyond the last suffix
        # use the last suffix instead of raising IndexError
        exponent = max(0, min(int(math.log(num, factor)), max_exponent))
        # The logarithm is computed in floating point and may land just
        # beside an exact power of factor, so check against the real powers
        if exponent < max_exponent and num >= factor ** (exponent + 1):
            exponent += 1
        elif exponent > 0 and num < factor ** exponent:
            exponent -= 1

    converted = num / (factor ** exponent)
    return fmt % (converted, suffixes[exponent])
2122
2123
def format_bytes(bytes):
    """Format a byte count with two decimals and a 1024-based "iB" suffix; 'N/A' if there is nothing to show"""
    formatted = format_decimal_suffix(bytes, '%.2f%siB', factor=1024)
    return formatted or 'N/A'
2126
2127
def lookup_unit_table(unit_table, s):
    """
    Parse a "<number> <unit>" value at the start of s.

    unit_table maps unit names to multipliers. The number may use "," or "."
    as its decimal separator and may be separated from the unit by
    whitespace. Returns the number times the unit's multiplier, rounded to
    the nearest integer, or None if s does not start with such a value.
    """
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    # round(), not int(): a float product can land a hair below the exact
    # value (4.35 * 100 == 434.99999999999994), which int() would truncate
    return round(float(num_str) * mult)
2137
2138
def parse_filesize(s):
    """
    Parse a human-readable file size such as "5.5 MiB" into a number of bytes.

    Returns None if s is None or does not start with a number followed by a
    known unit.
    """
    if s is None:
        return None

    # (upper-case prefix letter, decimal unit name, binary unit name), in
    # increasing order of magnitude
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        _UNIT_TABLE.update((
            (letter + 'iB', binary),
            (letter + 'B', decimal),
            (lower + 'B', binary),
            (letter + 'b', decimal),
            (lower + 'b', decimal),
            (decimal_name + 'bytes', decimal),
            (binary_name + 'bytes', binary),
        ))

    return lookup_unit_table(_UNIT_TABLE, s)
2208
2209
def parse_count(s):
    """
    Parse a counter such as "1,234", "1.5M" or "Views 12k" into an integer.

    Returns None if s is None or no number can be extracted.
    """
    if s is None:
        return None

    # Drop a leading non-numeric word (everything up to the first whitespace
    # that follows only non-digits)
    text = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand, million, billion = 1000, 1000 ** 2, 1000 ** 3
    _UNIT_TABLE = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
        'b': billion,
        'B': billion,
    }
    scaled = lookup_unit_table(_UNIT_TABLE, text)
    if scaled is not None:
        return scaled

    # Fall back to a leading plain number followed by whitespace or the end
    leading_number = re.match(r'([\d,.]+)(?:$|\s)', text)
    if leading_number:
        return str_to_int(leading_number.group(1))
    return None
2237
2238
def parse_resolution(s):
    """
    Extract video dimensions from a string.

    Recognises, in this order, "<width>x<height>" (also with "X", "×" or
    ","), "<height>p" / "<height>i" and "4k" / "8k". Returns a dict with
    'width' and/or 'height', or an empty dict if nothing matches or s is
    None.
    """
    if s is None:
        return {}

    dimensions = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if dimensions:
        width, height = dimensions.group('w', 'h')
        return {
            'width': int(width),
            'height': int(height),
        }

    scanlines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scanlines:
        return {'height': int(scanlines.group(1))}

    k_label = re.search(r'\b([48])[kK]\b', s)
    if k_label:
        # 4k -> 2160 lines, 8k -> 4320 lines
        return {'height': int(k_label.group(1)) * 540}

    return {}
2259
2260
def parse_bitrate(s):
    """Return the integer in front of "kbps" in s, or None if s is not a string or has no such value"""
    if not isinstance(s, compat_str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2267
2268
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of a month by its full name in the given
        language (locale-independently); the English names are used for
        unknown languages. None if the name is not found """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        position = names.index(name)
    except ValueError:
        return None
    else:
        return position + 1
2278
2279
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month by its (locale-independently)
        English three-letter abbreviation, None if it is not found """

    abbreviations = [full_name[:3] for full_name in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
2288
2289
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start one of the predefined XML entities or a numeric character reference by '&amp;'"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
2296
2297
def setproctitle(title):
    """
    Set the process name through libc's prctl, when that is available.

    Does nothing on Jython, when 'libc.so.6' cannot be loaded, or when the
    loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title, so the string handed to prctl is always NUL-terminated
    # (a buffer sized len(title) would be filled without a terminator)
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2322
2323
def remove_start(s, start):
    """Return s without the prefix start; s is returned as is when it is None or lacks that prefix"""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
2326
2327
def remove_end(s, end):
    """Return s without the suffix end; s is returned as is when it is None, end is empty or s lacks that suffix"""
    # An empty suffix must be a no-op: s[:-len('')] is s[:0], i.e. ''
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
2330
2331
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s, if present"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2339
2340
def get_domain(url):
    """Return the host part of url without scheme and leading "www.", or None if no dotted host is found"""
    found = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not found:
        return None
    return found.group('domain')
2344
2345
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and trailing slashes"""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
2349
2350
def base_url(url):
    """Return the http(s) URL up to and including the last "/" that precedes any "?", "#" or "&" """
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
2353
2354
def urljoin(base, path):
    """
    Join path onto base, tolerating bad input.

    Byte strings are decoded as UTF-8. Returns None when path is empty or
    not a string, or when it is relative and base is not an http(s) or
    protocol-relative URL. A path that already has a scheme or starts with
    "//" is returned unchanged.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    # Already absolute or protocol-relative: there is nothing to join
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    base_is_usable = isinstance(base, compat_str) and re.match(
        r'^(?:https?:)?//', base)
    if not base_is_usable:
        return None
    return compat_urlparse.urljoin(base, path)
2368
2369
class HEADRequest(compat_urllib_request.Request):
    """Request that reports 'HEAD' as its HTTP method"""

    def get_method(self):
        return 'HEAD'
2373
2374
class PUTRequest(compat_urllib_request.Request):
    """Request that reports 'PUT' as its HTTP method"""

    def get_method(self):
        return 'PUT'
2378
2379
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """
    Convert v to an int, returning default if that is not possible.

    get_attr: if set, the attribute of that name is read from v first
    invscale, scale: the result is int(v) * invscale // scale
    None and the empty string yield default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
2392
2393
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None"""
    if v is None:
        return default
    return compat_str(v)
2396
2397
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: integers are passed through,
    and ",", "." and "+" are removed from strings before conversion.
    Any other type yields None """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if not isinstance(int_str, compat_str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
2405
2406
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default if v is None or cannot be converted"""
    if v is None:
        return default
    try:
        number = float(v)
        return number * invscale / scale
    except (ValueError, TypeError):
        return default
2414
2415
def bool_or_none(v, default=None):
    """Return v if it is a bool, otherwise default"""
    if isinstance(v, bool):
        return v
    return default
2418
2419
def strip_or_none(v, default=None):
    """Return v with surrounding whitespace stripped if it is a string, otherwise default"""
    if isinstance(v, compat_str):
        return v.strip()
    return default
2422
2423
def url_or_none(url):
    """
    Return the stripped url if it is a string that starts with "//" or with
    one of the accepted schemes (http(s), rtmp variants, rtsp variants, mms,
    ftp(s)) followed by "//"; None otherwise.
    """
    if not url or not isinstance(url, compat_str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2429
2430
def strftime_or_none(timestamp, date_format, default=None):
    """
    Format a timestamp with date_format, returning default on any failure.

    timestamp: a number (UNIX timestamp, interpreted as UTC) or a string in
               YYYYMMDD format
    Values of any other type, unparsable strings and timestamps outside the
    range supported by the platform all yield default.
    """
    datetime_object = None
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # datetime_object is still None for unsupported types; the resulting
        # AttributeError is turned into default below
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        # utcfromtimestamp signals out-of-range timestamps with OverflowError
        # or OSError rather than ValueError on some platforms
        return default
2441
2442
def parse_duration(s):
    """
    Parse a duration string into a number of seconds.

    Three notations are tried in turn: clock style ("[[[DD:]HH:]MM:]SS[.ms]"),
    unit style ("1d 2h 3m 4.5s", also ISO 8601 alike "P1DT2H3M4S"; years,
    months and weeks are accepted but ignored), and a single decimal amount
    of hours or minutes ("1.5 hours", "3 min"). A trailing "Z" is allowed.

    Returns None if s is not a string, is blank or matches none of these.
    """
    if not isinstance(s, compat_basestring):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None
    clock = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if clock:
        days, hours, mins, secs, ms = clock.groups()
    else:
        worded = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if worded:
            days, hours, mins, secs, ms = worded.groups()
        else:
            single_unit = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not single_unit:
                return None
            hours, mins = single_unit.groups()

    # Sum up whichever components were captured (ms still carries its
    # leading ".", so float() turns it into a fraction of a second)
    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
2500
2501
def prepend_extension(filename, ext, expected_real_ext=None):
    """
    Insert ext in front of filename's real extension ("a.mp4" -> "a.<ext>.mp4").

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
2508
2509
def replace_extension(filename, ext, expected_real_ext=None):
    """
    Replace filename's extension by ext ("a.mp4" -> "a.<ext>").

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        stem = name
    else:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
2515
2516
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary cannot be started. """
    try:
        process = Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate_or_kill()
    except OSError:
        return False
    return exe
2525
2526
def _get_exe_version_output(exe, args):
    """Run exe with args and return its combined stdout/stderr as text, or False if it cannot be started"""
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        process = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate_or_kill()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        return output.decode('ascii', 'ignore')
    return output
2540
2541
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """
    Extract a version string from a program's output.

    version_re: regex whose first group captures the version; by default
                the word following "version" is used
    Returns unrecognized if the regex does not match.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2551
2552
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable produced no output or could not be run """
    output = _get_exe_version_output(exe, args)
    if not output:
        return False
    return detect_exe_version(output, version_re, unrecognized)
2559
2560
class LazyList(collections.abc.Sequence):
    ''' Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the underlying iterable only when they are needed
    and are stored in an internal cache list. With reverse=True the items are
    presented in reverse order, which requires evaluating the whole iterable.
    '''

    class IndexError(IndexError):
        # Raised by __getitem__ in place of the builtin IndexError (which it subclasses)
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        ''' @param iterable  Source of the items; wrapped with iter()
            @param reverse   Whether to present the items in reverse order
            @param _cache    Existing cache list to share instead of starting
                             with an empty one (passed by __reversed__ and __copy__)
        '''
        self.__iterable = iter(iterable)
        self.__cache = [] if _cache is None else _cache
        self.__reversed = reverse

    def __iter__(self):
        if self.__reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Replay what has already been evaluated, then continue pulling
        # from the iterable, caching every new item before yielding it
        yield from self.__cache
        for item in self.__iterable:
            self.__cache.append(item)
            yield item

    def __exhaust(self):
        ''' Move all remaining items into the cache and return the cache (in original order) '''
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        ''' Evaluate the entire iterable '''
        # The returned list is a copy of the cache, reversed if this LazyList is reversed
        return self.__exhaust()[::-1 if self.__reversed else 1]

    @staticmethod
    def __reverse_index(x):
        # Map an index of the reversed view onto the cache: 0 -> -1, 1 -> -2, ...; None stays None
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        ''' Return the item (for an int) or a list of items (for a slice).
        Raises TypeError for any other index type and LazyList.IndexError
        when the cache lookup raises IndexError '''
        if isinstance(idx, slice):
            if self.__reversed:
                # Translate the slice so that it can be applied to the (unreversed) cache
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            # step = 0 so that neither of the "open-ended slice" checks below applies
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
            try:
                return self.__cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of additional items needed so that the largest requested
        # position is present in the cache
        n = max(start or 0, stop or 0) - len(self.__cache) + 1
        if n > 0:
            self.__cache.extend(itertools.islice(self.__iterable, n))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Non-empty iff the first item of the underlying iterable exists.
        # For a reversed list that item is the last one, hence self[-1]
        try:
            self[-1] if self.__reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # The length is only known after evaluating the entire iterable
        self.__exhaust()
        return len(self.__cache)

    def __reversed__(self):
        # The new object shares both the iterable and the cache list with this one
        return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)

    def __copy__(self):
        # The copy shares both the iterable and the cache list with this one
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2649
2650
class PagedList:
    """Base class for lists whose entries are fetched one page at a time.

    `pagefunc(pagenum)` must return an iterable with the entries of that
    page. Subclasses implement `_getslice`.
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the entries of page `pagenum` as a list, using the cache if possible"""
        cached = self._cache.get(pagenum)
        results = list(self._pagefunc(pagenum)) if cached is None else cached
        if self._use_cache:
            self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list; end=None means "until the end\""""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # NOTE: cache must be enabled if this is used
        if not (isinstance(idx, int) and idx >= 0):
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()
2688
2689
class OnDemandPagedList(PagedList):
    """PagedList that keeps requesting consecutive pages until a page comes
    back short or the requested end has been reached"""

    def _getslice(self, start, end):
        pagesize = self._pagesize
        for pagenum in itertools.count(start // pagesize):
            firstid = pagenum * pagesize
            nextfirstid = firstid + pagesize
            if start >= nextfirstid:
                continue

            # Offsets within this page; only the page containing `start` is
            # trimmed at the front and only the page containing `end` at the back
            startv = start % pagesize if firstid <= start < nextfirstid else 0
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % pagesize) + 1
            else:
                endv = None

            page_results = self.getpage(pagenum)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A page that is not "full" (does not contain page_size entries)
            # is assumed to be the last one, so there is no need to query
            # any further pages.
            if len(page_results) + startv < pagesize:
                break

            # The whole page was used and the next page is not of interest
            if end == nextfirstid:
                break
2723
2724
class InAdvancePagedList(PagedList):
    """PagedList for sources whose total number of pages is known in advance.

    The page cache is always enabled.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagecount = pagecount
        PagedList.__init__(self, pagefunc, pagesize, True)

    def _getslice(self, start, end):
        """Yield the entries in [start, end); end=None means "until the last page\""""
        start_page = start // self._pagesize
        # Never request a page beyond the known page count, even when `end`
        # (or an index passed to __getitem__) lies past the last entry
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the front of the first page
        skip_elems = start - start_page * self._pagesize
        # Entries still to be yielded; None when there is no upper bound
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
2748
2749
def uppercase_escape(s):
    """Decode every \\UXXXXXXXX escape (8 hex digits) found in `s`"""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode_match, s)
2756
2757
def lowercase_escape(s):
    """Decode every \\uXXXX escape (4 hex digits) found in `s`"""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode_match, s)
2764
2765
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    # The second argument lists the characters that are left unquoted
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2771
2772
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded; every other component except the scheme is percent-escaped
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    )
    return escaped.geturl()
2783
2784
def parse_qs(url):
    """Parse the query string of `url` with compat_parse_qs"""
    query = compat_urllib_parse_urlparse(url).query
    return compat_parse_qs(query)
2787
2788
def read_batch_urls(batch_fd):
    """Read the URLs from the lines of `batch_fd` and close it afterwards.

    Empty lines and lines starting with "#", ";" or "]" are skipped.
    """
    BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out in general since it is part of the URI.
        # However, it can be safely stripped out if following a whitespace
        without_comment = re.split(r'\s#', url, maxsplit=1)[0]
        return without_comment.rstrip()

    with contextlib.closing(batch_fd) as fd:
        return list(filter(None, map(fixup, fd)))
2806
2807
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes"""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2810
2811
def update_url_query(url, query):
    """Return `url` with the parameters in `query` merged into its query string.

    Parameters already present in the URL are overwritten by `query`.
    `url` is returned unchanged when `query` is empty.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2820
2821
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from `req` with the given overrides.

    `headers` are merged over the headers of `req`, `query` is merged into
    the URL, and `url`/`data` replace the originals when truthy. HEAD and
    PUT requests keep their method; the timeout is carried over if set.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    method = req.get_method()
    if method == 'HEAD':
        request_cls = HEADRequest
    elif method == 'PUT':
        request_cls = PUTRequest
    else:
        request_cls = compat_urllib_request.Request
    updated = request_cls(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        updated.timeout = req.timeout
    return updated
2840
2841
def _multipart_encode_impl(data, boundary):
    """Encode `data` as multipart/form-data using the given `boundary`.

    Returns (body_bytes, content_type). Raises ValueError if the boundary
    occurs inside any of the encoded fields.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes

    out = b''
    for name, value in data.items():
        out += delimiter + b'\r\n'
        if isinstance(name, compat_str):
            name = name.encode('utf-8')
        if isinstance(value, compat_str):
            value = value.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        out += content

    out += delimiter + b'--\r\n'

    return out, content_type
2862
2863
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary and a
        ValueError is raised if it clashes with the data. Otherwise random
        boundaries are generated until one does not clash.

    Returns a tuple (encoded_body, content_type)

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary occurs in the data; try another one
            continue
2892
2893
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable one of several keys, in `d`.

    For a list/tuple of keys, missing keys and None values are skipped, as
    are falsy values unless `skip_false_values` is False. A single key is
    looked up with a plain `d.get`.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None or (skip_false_values and not value):
            continue
        return value
    return default
2902
2903
def try_get(src, getter, expected_type=None):
    """Apply each getter to `src` and return the first acceptable result.

    Getters raising AttributeError, KeyError, TypeError or IndexError are
    skipped, as are results that are not instances of `expected_type`
    (when given). Returns None if no getter yields an acceptable value.
    """
    for get in variadic(getter):
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
2913
2914
def merge_dicts(*dicts):
    """Merge the dicts, giving priority to the earlier ones.

    None values are ignored. A value that is already present is only
    replaced when it is an empty string and the new value is a non-empty
    string.
    """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
                continue
            existing = merged[key]
            if (isinstance(value, compat_str) and value
                    and isinstance(existing, compat_str) and not existing):
                merged[key] = value
    return merged
2927
2928
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as a compat_str, decoding it with `encoding` if it is not one already"""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2931
2932
# Maps rating labels to the age limit that parse_age_limit() returns for them.
# parse_age_limit() upper-cases its input before looking it up here
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Maps "TV-..." labels to the age limit that parse_age_limit() returns for them.
# parse_age_limit() matches the part after "TV-" and also accepts the label
# with "_" or nothing in place of the "-"
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2950
2951
def parse_age_limit(s):
    """Convert an age limit or rating label to an age in years.

    Accepts an int in the range 0-21, a string of one or two digits with an
    optional trailing "+", a key of US_RATINGS or a TV_PARENTAL_GUIDELINES
    label (both case-insensitive). Returns None for anything else.
    """
    # Exact type check: subclasses of int (such as bool) are not accepted here
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]

    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, s)
    return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)] if tv_rating else None
2967
2968
def strip_jsonp(code):
    """Remove a JSONP wrapper such as "callback(...);" and return what is inside the parentheses.

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
2977
2978
def js_to_json(code, vars={}):
    """Rewrite JavaScript literal syntax in `code` so that it becomes JSON text.

    The conversion is purely token based: a regex picks out string literals,
    comments, trailing commas, identifiers, hex/octal integers, numeric
    object keys and runs of "!", and fix_kv() decides what to put in place of
    each of them. Everything else in `code` is left as it is.

    vars is a dict of var, val pairs to substitute; the value is inserted
    verbatim in place of an identifier equal to the key.
    """
    # A /* ... */ block comment or a // line comment including its newline
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Whitespace, optionally with a single comment in it
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (pattern, base) pairs for integer literals that JSON does not know:
    # hexadecimal (0x...) and octal (leading zeros); an optional trailing ":"
    # means the integer is used as an object key
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Returns the JSON replacement for one matched token
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, "!" runs and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # String literal: strip the quotes and fix up its content so that
            # it is valid inside double quotes (escape ", unescape ', remove
            # escaped newlines, turn \x into \u00). Other escapes are kept
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    # Object keys must be strings in JSON, so quote them
                    return '"%d":' % i if v.endswith(':') else '%d' % i

            if v in vars:
                return vars[v]

        # Whatever is left (string content, identifiers, numeric keys) is
        # emitted as a double-quoted string
        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3025
3026
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its position in
    `quality_ids`, or to -1 if it is not in there """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
3035
3036
# Default output templates, keyed by template type
# NOTE(review): the code consuming these is outside this part of the file
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Additional template types. The value is either None or a string -
# presumably the filename suffix used for that kind of file; confirm
# against the code that reads this table
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
3053
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# Template for a verbose regex matching one %-style format specifier that is
# not escaped as "%%". It has two placeholders: {0} for the pattern of the
# mapping key and {1} for the pattern of the conversion type - presumably
# filled in with str.format() by the users of this template
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of %-style formatting, one character each
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3072
3073
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than `length` are cut so that the result, including the
    trailing "...", is `length` characters long. None is passed through """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
3082
3083
def version_tuple(v):
    """Split a version string at "." and "-" and return its parts as a tuple of ints"""
    return tuple(map(int, re.split(r'[-.]', v)))
3086
3087
def is_outdated_version(version, limit, assume_new=True):
    """Return whether `version` is older than `limit`.

    If `version` is empty or one of the versions cannot be parsed, the
    answer is decided by `assume_new` instead (True means "not outdated").
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
3095
3096
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at the top of the module
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
3103
3104
def args_to_str(args):
    """Get a short string representation for a subprocess command"""
    return ' '.join(map(compat_shlex_quote, args))
3108
3109
def error_to_compat_str(err):
    """Return the message of `err` as a text string"""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
3117
3118
def mimetype2ext(mt):
    """Guess a file extension from a MIME type.

    Parameters after ";" and surrounding whitespace are ignored and the
    type is matched case-insensitively. The extension is taken, in order of
    priority, from the full type, from its subtype, or from the structured
    syntax suffix of the subtype (the part after "+"). If none of the tables
    applies, the lower-cased subtype itself is returned, with "+" replaced
    by ".". Returns None if `mt` is None.
    """
    if mt is None:
        return None

    # Type and subtype names are case-insensitive (RFC 2045, section 5.1),
    # so normalize once up front instead of only for one of the lookups
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    _, _, subtype = mt.rpartition('/')
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    _, _, suffix = subtype.partition('+')
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
3181
3182
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension or for a filename/URL.

    A value without any "." is treated as a bare extension. Returns None
    for empty input or when the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _encoding = mimetypes.guess_type(name)
    return mime_type
3189
3190
def parse_codecs(codecs_str):
    """Split a comma separated codecs string into video and audio codec.

    Returns a dict with the keys "vcodec", "acodec" and "dynamic_range" when
    at least one known codec was found ("none" stands for a missing codec).
    If no codec is known but there are exactly two entries, they are taken
    as vcodec and acodec in that order. Otherwise returns an empty dict.
    A warning is written to stderr for every unknown codec.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = (
        'avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = (
        'flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3',
        'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')

    stripped = map(str.strip, codecs_str.strip().strip(',').split(','))
    split_codecs = [entry for entry in stripped if entry]

    vcodec = acodec = hdr = None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Zeros are dropped so that e.g. "vp09" and "av01" are recognized
        codec = parts[0].replace('0', '')
        if codec in VIDEO_CODECS:
            # Only the first video codec counts
            if vcodec:
                continue
            vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in AUDIO_CODECS:
            # Only the first audio codec counts
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
    if len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3228
3229
def urlhandle_detect_ext(url_handle):
    """Determine the file extension from the headers of a response.

    The filename of a Content-Disposition "attachment" header is preferred;
    otherwise the extension is derived from the Content-Type header.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        attachment = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
3242
3243
def encode_data_uri(data, mime_type):
    """Return a base64 "data:" URI holding the bytes `data` with the given MIME type"""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3246
3247
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked if no age limit is set (age_limit is None) or if the
    content is available for everyone (content_limit is None) """
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3256
3257
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to their byte order mark (UTF-8 if there
    is none) and count as HTML if the first non-whitespace character is "<".
    Returns the match object of that test, or None """

    # The UTF-32 marks must be tried before the UTF-16 ones since the
    # UTF-32-LE mark starts with the UTF-16-LE mark
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, offset = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = enc, len(bom)
            break
    text = first_bytes[offset:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
3276
3277
def determine_protocol(info_dict):
    """Return the protocol of a format.

    An explicit "protocol" entry wins. Otherwise the protocol is derived
    from the URL: its prefix for rtmp/mms/rtsp, its extension for m3u8/f4m,
    and its scheme in all other cases.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
3298
3299
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned

    @param header_row  The values of the first row
    @param data        The remaining rows
    @param delim       If truthy, a row made of this string repeated to the
                       width of each column is inserted below the header
    @param extra_gap   Number of additional spaces between columns
    @param hide_empty  Whether to leave out the columns that are empty in all
                       rows of `data`
    @returns           The table as a single string, one line per row

    The arguments are not modified. """
    def width(string):
        # Visible width: terminal sequences and the alignment tab don't count
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        return [col for (take, col) in zip(filterArray, row) if take]

    # The cells are padded in place further down, so work on copies
    # instead of rewriting the lists of the caller
    header_row = list(header_row)
    data = [list(row) for row in data]

    if hide_empty:
        # A column whose maximum width in `data` is 0 is dropped
        max_lens = get_max_lens(data)
        header_row = filter_using_list(header_row, max_lens)
        data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Right align what follows the tab by expanding the tab
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3331
3332
def _match_one(filter_part, dct, incomplete):
    """Evaluate a single condition of a filter string against `dct`.

    Two forms are understood:
      * "key OP value", where OP is one of the comparison operators below,
        optionally preceded by "!" (negation) and/or followed by "?" (the
        condition also passes when the field is missing). The value may be
        quoted with single or double quotes.
      * "key" or "!key", which tests whether the field is present (or, for
        booleans, true) or not.

    When `incomplete` is truthy, conditions on missing fields pass.
    Raises ValueError if the condition cannot be parsed, or if a string
    operator is used together with a numeric value.
    """
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        # Exactly one of the two value groups has matched, and both need at
        # least one character, so one of them is always a non-empty string
        comparison_value = m['quotedstrval'] or m['strval']
        if m['quote']:
            # Unescape the quote character inside a quoted value
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, compat_numeric_types):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                # Not a plain integer: try it as a file size (with and
                # without a trailing "B") and finally as a duration
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            return incomplete or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if incomplete and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
3408
3409
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
        The conditions are separated by "&" (a literal "&" is written as "\\&")
        and all of them must pass
        When incomplete, all conditions passes on missing fields
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3417
3418
def match_filter_func(filter_str):
    """Build a filter callback bound to filter_str.

    The returned function takes an info dict (any further arguments are
    forwarded to match_str()) and returns None when the dict passes the
    filter, or a message explaining that the video is skipped otherwise.
    """
    def _match_func(info_dict, *args, **kwargs):
        if not match_str(filter_str, info_dict, *args, **kwargs):
            # Name the video by title, falling back to its id and then to 'video'
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
3427
3428
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds.

    Supported forms:
      * offset time: a decimal number optionally followed by one of the
        metrics "h", "m", "s" or "ms" (a bare number is taken as seconds)
      * clock time: H:MM:SS, optionally followed by "." or ":" and more
        digits, which are read as the fractional part of the seconds

    @param time_expr  The time expression (may be None or empty)
    @returns          The time in seconds as a number, or None if the
                      expression is empty or not in a supported form
    """
    if not time_expr:
        return None

    # 'ms' must be tried before 'm' so that it is not read as minutes
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)(?P<metric>h|ms|m|s)?$', time_expr)
    if mobj:
        offset = float(mobj.group('time_offset'))
        metric = mobj.group('metric')
        if metric == 'h':
            return offset * 3600
        if metric == 'm':
            return offset * 60
        if metric == 'ms':
            return offset / 1000
        # Seconds, either explicit ("s") or implied by a bare number
        return offset

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes = int(mobj.group(1)), int(mobj.group(2))
        # A ":" before the trailing digits is treated like a decimal point
        seconds = float(mobj.group(3).replace(':', '.'))
        return 3600 * hours + 60 * minutes + seconds

    return None
3440
3441
def srt_subtitles_timecode(seconds):
    """Format a time given in seconds as an SRT timecode (HH:MM:SS,mmm)"""
    hours, minutes, secs, msecs = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, msecs)
3444
3445
def ass_subtitles_timecode(seconds):
    """Format a time given in seconds as an ASS timecode (H:MM:SS.cc)"""
    timetuple = timetuple_from_msec(seconds * 1000)
    # The last field is expressed in hundredths of a second
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
3449
3450
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT

    Styling from <style> elements, from the first body/div element and from
    tts:* attributes is rendered as <b>, <i>, <u> and <font> tags.
    Paragraphs without a begin time, or with neither an end time nor a
    duration, are skipped; the numbering of the remaining cues still counts
    the skipped paragraphs.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no paragraph elements
    '''
    # Older namespace URIs that are rewritten to the current TTML ones
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # Style id -> resolved property dict (including inherited properties)
    styles = {}
    # Properties taken from the style of the body/div elements
    default_style = {}

    class TTMLPElementParser(object):
        # NOTE(review): _unclosed_elements and _applied_styles are class
        # attributes that are only ever mutated in place, so all parser
        # instances created during one dfxp2srt() call share them. An entry
        # pushed onto _applied_styles for an element that opened no tags is
        # not popped again and therefore stays visible to later paragraphs.
        # Changing this alters the generated markup, so it is left as is.
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence (lowest to highest): default style, referenced
                # style, tts:* attributes set directly on the element
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties that are already in effect
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close the tags opened for this element, innermost first
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the paragraph and feed it through the tag-emitting parser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    style_elements = dfxp.findall(_x('.//ttml:style'))

    def resolve_styles(ignore_missing_parents=False):
        # Make one pass over all <style> elements and return the number of
        # styles that had to be postponed because their parent style is not
        # resolved yet
        pending = 0
        for style in style_elements:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id in styles:
                    styles[style_id] = styles[parent_style_id].copy()
                elif not ignore_missing_parents:
                    pending += 1
                    continue
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        return pending

    # A style may be declared before the style it inherits from, so keep
    # making passes for as long as they resolve further styles
    pending = resolve_styles()
    while pending:
        previous_pending, pending = pending, resolve_styles()
        if pending >= previous_pending:
            # No progress: the remaining parent references are missing,
            # cyclic or point to styles without supported properties.
            # Apply the affected styles without them instead of looping forever
            resolve_styles(ignore_missing_parents=True)
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Fall back to begin + duration when there is no usable end time
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3613
3614
def cli_option(params, command_option, param):
    """Build the command-line arguments for an option that takes a value.

    @param params          Dictionary of parameters
    @param command_option  The option name to emit, e.g. '--proxy'
    @param param           The key to look up in params
    @returns               [command_option, value] with the value converted
                           to a string, or [] if the parameter is missing
                           or None
    """
    value = params.get(param)
    if value is None:
        return []
    # Stringify every value, including falsy ones such as 0 or False, so
    # that the result only ever contains strings
    return [command_option, str(value)]
3620
3621
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command-line arguments for a boolean option.

    @param params          Dictionary of parameters
    @param command_option  The option name to emit
    @param param           The key to look up in params; its value must be
                           a bool or None
    @param true_value      Text emitted when the parameter is True
    @param false_value     Text emitted when the parameter is False
    @param separator       If given, option and value are joined with it
                           into a single argument
    @returns               [] if the parameter is missing or None, otherwise
                           a list with one (joined) or two arguments
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    value = true_value if flag else false_value
    if not separator:
        return [command_option, value]
    return [command_option + separator + value]
3630
3631
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Build the command-line arguments for an option that takes no value.

    @returns [command_option] if params[param] equals expected_value,
             otherwise []
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
3635
3636
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Select extra command-line arguments from a keyed argument dict.

    @param argdict     Dict mapping lower-case keys to argument lists. A
                       list/tuple is accepted for backward compatibility and
                       None means that nothing was configured
    @param keys        List/tuple of lookup entries, tried in order. Each
                       entry is expanded with variadic(), so it may name
                       several keys at once
    @param default     Returned when nothing matches
    @param use_compat  Whether a list/tuple argdict is returned as is
                       (otherwise it is ignored)
    @returns           The concatenated argument lists of the first entry
                       for which at least one key is set, or default
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        return argdict if use_compat else default
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = (argdict.get(key.lower()) for key in variadic(key_list))
        arg_list = [args for args in candidates if args is not None]
        if arg_list:
            return list(itertools.chain.from_iterable(arg_list))
    return default
3655
3656
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Look up the configured arguments for an executable used by main_key.

    The root key is the lower-cased exe when it equals the lower-cased
    main_key, and "<main_key>+<exe>" otherwise; every entry of keys (a single
    empty suffix if keys is not given) is appended to it. When the bare root
    key is among the resulting keys, the (main_key, exe) group (only if the
    two differ) and 'default' are added as fallbacks; otherwise the
    list/tuple compatibility mode of cli_configuration_args() is disabled.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    if main_key == exe:
        root_key = exe
    else:
        root_key = f'{main_key}+{exe}'
    suffixes = keys or ['']
    lookup_keys = [f'{root_key}{k}' for k in suffixes]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3668
3669
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes"""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked at; returns None
        for an unknown code.
        """
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns the first two-letter code in the table that maps to code,
        or None if there is none.
        """
        return next(
            (short_name for short_name, long_name in cls._lang_map.items() if long_name == code),
            None)
3873
3874
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the corresponding full name

        The lookup is case-insensitive; returns None for an unknown code.
        """
        country_code = code.upper()
        return cls._country_map.get(country_code)
4133
4134
class GeoUtils(object):
    """Generation of random IPv4 addresses from country or CIDR blocks"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address from a country's block or a CIDR block

        @param code_or_block  Either a two-character country code (looked up
                              case-insensitively in _country_ip_map) or a
                              block in "address/prefix-length" notation
        @returns              The address in dotted-quad form, or None if
                              the country code is not in the table
        """
        if len(code_or_block) != 2:
            block = code_or_block
        else:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        network, prefix_length = block.split('/')
        # Work on the address as an unsigned 32-bit integer in network byte order
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Setting all host bits yields the last address of the block
        highest = lowest | (0xffffffff >> int(prefix_length))
        chosen = random.randint(lowest, highest)
        return compat_str(socket.inet_ntoa(compat_struct_pack('!L', chosen)))
4393
4394
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets a single request choose its own proxy.

    A request may carry a 'Ytdl-request-proxy' header; its value replaces the
    handler-wide proxy for that request and the header is removed again.
    """

    def __init__(self, proxies=None):
        # Install pass-through openers for http and https first; the base
        # class initialiser runs afterwards
        for scheme in ('http', 'https'):
            default_open = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, default_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy takes precedence over the configured one
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            proxy = override
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # Only tag the request here: yt-dlp's http/https handlers do the
            # actual wrapping of the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
4418
4419
4420 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4421 # released into Public Domain
4422 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4423
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.

    @param n          value to convert; it is passed through int() first.
                      Zero and negative values encode as a single zero byte
    @param blocksize  pad the result to a multiple of this many bytes
    @returns          bytes without leading zero bytes (apart from padding)
    """
    n = int(n)
    if n > 0:
        # Shortest big-endian representation: ceil(bit_length / 8) bytes
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    # add back some pad bytes
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4452
4453
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().

    @param s  bytes-like object; an empty value yields 0
    @returns  non-negative integer
    """
    return int.from_bytes(s, 'big')
4469
4470
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The byte order is reversed before the data is read as a hex number
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return format(ciphertext, 'x')
4486
4487
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout [0, 2] + padding + [0] + data. Every padding
    byte is non-zero, so the first zero after the leading [0, 2] marks the
    start of the data.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves room for fewer than 8 padding bytes
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # A zero here would be taken for the separator and truncate the message
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + list(data)
4501
4502
def encode_base_n(num, n, table=None):
    """Encode the integer `num` as a string in base `n`.

    @param num    integer to encode; a negative value is encoded as '-'
                  followed by the encoding of its absolute value
    @param n      the base
    @param table  string of digit symbols; defaults to the first `n`
                  characters of 0-9, a-z, A-Z
    @returns      the encoded string
    @raises ValueError  if `table` has fewer than `n` symbols, or if `n` is
                        below 2 for a non-zero `num`
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # With a base below 2 the division loop below could never reach zero
    if n < 2:
        raise ValueError('base %d is too small' % n)

    # Floor division never brings a negative number to zero, so work on the
    # magnitude and restore the sign afterwards
    sign = ''
    if num < 0:
        sign, num = '-', -num

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return sign + ''.join(reversed(digits))
4519
4520
def decode_packed_codes(code):
    """Expand code matched by PACKED_CODES_RE back into readable source.

    The match supplies the obfuscated code, a base, a symbol count and a
    '|'-separated symbol list. Every word of the obfuscated code is replaced
    by the symbol stored under that word; an empty symbol stands for the word
    itself.
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    radix = int(base)
    remaining = int(count)
    words = symbols.split('|')

    # Map each index, written in the given base, to its symbol
    symbol_table = {}
    while remaining:
        remaining -= 1
        encoded_index = encode_base_n(remaining, radix)
        symbol_table[encoded_index] = words[remaining] or encoded_index

    def _substitute(word_match):
        return symbol_table[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _substitute, obfuscated_code)
4537
4538
def caesar(s, alphabet, shift):
    """Shift every character of `s` that occurs in `alphabet` by `shift`
    positions (wrapping around); other characters are kept unchanged."""
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = []
    for char in s:
        if char in alphabet:
            char = alphabet[(alphabet.index(char) + shift) % size]
        shifted.append(char)
    return ''.join(shifted)
4546
4547
def rot47(s):
    """Apply a Caesar shift of 47 over the alphabet below to `s`."""
    rot47_alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, rot47_alphabet, 47)
4550
4551
def parse_m3u8_attributes(attrib):
    """Parse a comma separated KEY=VALUE attribute list into a dict.

    Values may be bare or double-quoted; the surrounding quotes are removed.
    """
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    return {
        name: raw[1:-1] if raw.startswith('"') else raw
        for name, raw in re.findall(attribute_re, attrib)
    }
4559
4560
def urshift(val, n):
    """Right shift `val` by `n` bits, treating a negative `val` as its
    unsigned 32-bit equivalent."""
    if val < 0:
        val += 0x100000000
    return val >> n
4563
4564
4565 # Based on png2str() written by @gdkchan and improved by @yokrysty
4566 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG data into rows of colour bytes.

    Only the width and height are read from the IHDR chunk. Every scanline is
    treated as `width * 3` bytes with 3 bytes per pixel; bit depth, colour
    type and interlacing are not inspected.

    @param png_data  the complete PNG file as bytes
    @returns         (width, height, pixels) where pixels is a list of rows
                     and each row is a list of width * 3 integers
    @raises IOError  if the signature or the leading IHDR chunk is missing,
                     or if there is no IDAT data
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # The 8 byte signature must be followed by a chunk of type IHDR
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned integer of 1, 2 or 4 bytes
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Each chunk is: 4 byte length, 4 byte type, `length` bytes of data, 4 byte CRC
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # The check above guarantees that the first chunk is IHDR
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # The compressed image stream is the concatenation of all IDAT chunks
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Number of data bytes per row (3 per pixel); each row is preceded by
    # one filter type byte
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # Look up an already reconstructed byte by its flat index
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        # Appended before it is filled so that _get_pixel can read the
        # bytes of the current row reconstructed so far
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbours outside the image count as 0
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Any other filter type (including 0) leaves the byte unchanged
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Use the neighbour closest to p; ties are resolved in the
                # order a (left), b (up), c (upper left)
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
4670
4671
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file `path` to `value`.

    The importable `xattr` module (either the pyxattr or the xattr flavour)
    is used when available. Without it, NTFS Alternate Data Streams are used
    on Windows and the `setfattr` or `xattr` command line tools elsewhere.

    @param path   path of the file to modify
    @param key    name of the attribute
    @param value  attribute value as bytes
    @raises XAttrUnavailableError  if pyxattr is too old or no tool is found
    @raises XAttrMetadataError     if writing the attribute fails
    """
    try:
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            minimum_pyxattr = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(minimum_pyxattr):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        minimum_pyxattr, xattr.__version__))
            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            try:
                with open(path + ':' + key, 'wb') as stream:
                    stream.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        has_setfattr = check_executable('setfattr', ['--version'])
        has_xattr = check_executable('xattr', ['-h'])

        if not (has_setfattr or has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        # The command line tools take the value as text; setfattr is preferred
        text_value = value.decode('utf-8')
        if has_setfattr:
            executable, opts = 'setfattr', ['-n', key, '-v', text_value]
        else:
            executable, opts = 'xattr', ['-w', key, text_value]

        cmd = [encodeFilename(executable, True)]
        cmd += [encodeArgument(o) for o in opts]
        cmd += [encodeFilename(path, True)]

        try:
            proc = Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        _, stderr = proc.communicate_or_kill()
        stderr = stderr.decode('utf-8', 'replace')
        if proc.returncode != 0:
            raise XAttrMetadataError(proc.returncode, stderr)
4754
4755
def random_birthday(year_field, month_field, day_field):
    """Pick a random date between 1950-01-01 and 1995-12-31 (inclusive).

    @returns  dict mapping the three given field names to the year, month
              and day of that date, each as a string
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    field_names = (year_field, month_field, day_field)
    date_parts = (birthday.year, birthday.month, birthday.day)
    return dict(zip(field_names, map(str, date_parts)))
4766
4767
# Templates for internet shortcut files, which are plain text files.
# They are %-format strings taking a mapping; the leading newline that follows
# the opening quotes is removed by lstrip().

# `.url` shortcut: an [InternetShortcut] section; needs the key `url`
DOT_URL_LINK_TEMPLATE = '''
[InternetShortcut]
URL=%(url)s
'''.lstrip()

# `.webloc` shortcut: an Apple property list in XML form; needs the key `url`
DOT_WEBLOC_LINK_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''.lstrip()

# `.desktop` shortcut: a [Desktop Entry] of type Link; needs the keys
# `filename` and `url`
DOT_DESKTOP_LINK_TEMPLATE = '''
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''.lstrip()

# Maps a link type name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4799
4800
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    @param iri   the IRI as a string; it may lack a host (e.g. a relative reference)
    @returns     the corresponding URI as a string
    @raises ValueError  if the network location contains an IPv6 literal
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no network location at all
    hostname = iri_parts.hostname or ''
    net_location += hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.

    # Port 80 is only redundant for http. Dropping it for any other scheme
    # would make the URI point at a different port.
    port = iri_parts.port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4843
4844
def to_high_limit_path(path):
    """Return `path` in a form that is not subject to the MAX_PATH limit.

    On platforms other than 'win32' and 'cygwin' the path is returned as is.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    extended_prefix = r'\\?\ '.rstrip()
    return extended_prefix + os.path.abspath(path)
4851
4852
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format a value with `template`, or return `default` if it is ignored.

    @param obj       the value itself, or a mapping if `field` is given
    @param field     key to look up in `obj` (missing keys give `default`)
    @param template  %-format string applied to the value
    @param ignore    values for which `default` is returned instead
    @param default   fallback result
    @param func      optional callable applied to a non-ignored value
                     before formatting
    """
    if field is not None:
        value = obj.get(field, default)
    elif obj is None:
        value = default
    else:
        value = obj

    if func and value not in ignore:
        value = func(value)

    if value in ignore:
        return default
    return template % value
4861
4862
def clean_podcast_url(url):
    """Remove the tracking/analytics prefixes listed in the pattern below
    from a podcast URL."""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.compile(tracking_prefix_re).sub('', url)
4878
4879
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version 4 UUID in its canonical lowercase text form.

    In the template, `x` stands for any hex digit. `y` is the digit that
    carries the variant bits, so it is limited to 8, 9, a or b.
    """
    def _random_digit(mobj):
        if mobj.group(0) == 'y':
            # Indices 8..11 of the table are exactly '8', '9', 'a' and 'b'
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4885
4886
def make_dir(path, to_screen=None):
    """Create the directory that is to contain the file `path`.

    @param path       path of a file; only its directory part is created
    @param to_screen  optional callable that receives an error message if
                      the directory cannot be created
    @returns          True on success (or if nothing had to be created),
                      False if creating the directory failed
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            # exist_ok covers the directory appearing between the check
            # above and this call
            os.makedirs(dn, exist_ok=True)
        return True
    except (OSError, IOError) as err:
        # Only report when a reporter was actually given
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
4897
4898
def get_executable_path():
    """Return the absolute path of the directory the program runs from.

    This is the directory of `sys.executable` for a frozen build, and
    otherwise a directory above this file: two levels up when loaded through
    a zipimporter, one level up in all other cases.
    """
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        location = os.path.dirname(sys.executable)
    else:
        running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
        parent = '../..' if running_from_zip else '..'
        location = os.path.join(os.path.dirname(__file__), parent)
    return os.path.abspath(location)
4908
4909
def load_plugins(name, suffix, namespace):
    """Load plugin classes from `ytdlp_plugins/<name>/__init__.py`.

    The file is looked up below get_executable_path(). Every attribute of
    the loaded module whose name ends with `suffix` and is not yet a key of
    `namespace` is added to `namespace`.

    @returns  dict of the names and objects that were added; empty if the
              plugin file does not exist
    """
    loaded = {}
    try:
        spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        module = importlib.util.module_from_spec(spec)
        # The module is registered before it is executed
        sys.modules[spec.name] = module
        spec.loader.exec_module(module)
        for attr_name in dir(module):
            if attr_name in namespace or not attr_name.endswith(suffix):
                continue
            loaded[attr_name] = namespace[attr_name] = getattr(module, attr_name)
    except FileNotFoundError:
        pass
    return loaded
4928
4929
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            a function, a tuple of strings or "...".
                            When a function is given, it takes the key as argument and
                            returns whether the key matches or not. When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    @returns                The result of the first path that yields a value which is
                            not None (for branching paths: a non-empty list of values,
                            or its first item if get_all is False), else `default`
    # TODO: Write tests
    '''
    if not casesense:
        # Lowercase all string keys of every path; dict keys are lowercased
        # on lookup in _traverse_obj
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` belongs to the enclosing function and is reset for every
        # path. It records the deepest level of branching (list nesting)
        # that this traversal produced.
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if obj is None:
                return None
            if isinstance(key, (list, tuple)):
                # Traverse every alternative, then continue as if "..." had
                # been given on the list of results
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                # Branch: apply the rest of the path to every value
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                # Branch: apply the rest of the path to the values whose
                # key/index is accepted by the function
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    # NOTE(review): iterating a str yields single characters,
                    # which cannot be unpacked into (k, v) below - confirm
                    # whether enumerate() was intended here
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # Plain dict lookup; without casesense an exact match is
                # preferred over a case-insensitive one
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # 'N' becomes an index and 'A:B[:C]' a slice
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # A bare ':' selects everything, like "..."
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    # Normalise expected_type into a function that returns the accepted
    # value, or None to reject it
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # The result is nested `depth` levels deep: flatten it to a
                # single level, dropping None along the way
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
5026
5027
5028 # Deprecated
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj; writes a deprecation notice
    on every call."""
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
5033
5034
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return `x` unchanged if it is an iterable other than one of
    `allowed_types`; otherwise wrap it in a one-element tuple."""
    if isinstance(x, allowed_types):
        return (x,)
    if isinstance(x, collections.abc.Iterable):
        return x
    return (x,)
5037
5038
5039 # create a JSON Web Signature (jws) with HS256 algorithm
5040 # the resulting format is in JWS Compact Serialization
5041 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5042 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JSON Web Token signed with HMAC-SHA256 (HS256).

    The three parts are encoded as base64url without padding, as the JWS
    Compact Serialization requires.

    @param payload_data  JSON serializable payload (the claims)
    @param key           the shared secret as a string
    @param headers       optional dict of header fields, added to (or
                         overriding) the default `alg` and `typ`
    @returns             the token `header.payload.signature` as bytes
    """
    def _b64url(data):
        return base64.urlsafe_b64encode(data).rstrip(b'=')

    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = _b64url(json.dumps(header_data).encode('utf-8'))
    payload_b64 = _b64url(json.dumps(payload_data).encode('utf-8'))
    # The signature covers the encoded header and payload joined by a dot
    signing_input = header_b64 + b'.' + payload_b64
    h = hmac.new(key.encode('utf-8'), signing_input, hashlib.sha256)
    return signing_input + b'.' + _b64url(h.digest())
5056
5057
5058 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded payload (claims) of a JSON Web Token.

    The signature is NOT verified and the header is not inspected.

    @param jwt  the token `header.payload.signature` as str or bytes; the
                payload may be base64url or standard base64, with or
                without padding
    @returns    the payload parsed from JSON
    @raises ValueError  if the token does not consist of exactly three parts
    """
    if isinstance(jwt, bytes):
        jwt = jwt.decode('ascii')
    _, payload_b64, _ = jwt.split('.')
    # Accept the standard base64 alphabet as well
    payload_b64 = payload_b64.replace('+', '-').replace('/', '_')
    # Tokens usually come without padding, which the decoder insists on
    payload_b64 += '=' * (-len(payload_b64) % 4)
    return json.loads(base64.urlsafe_b64decode(payload_b64).decode('utf-8'))
5063
5064
def supports_terminal_sequences(stream):
    """Tell whether terminal sequences can be written to `stream`.

    On Windows this needs WINDOWS_VT_MODE to be set and a Windows version of
    at least 10.0.10586; elsewhere the TERM environment variable must be
    non-empty. In addition the stream has to report being a tty.
    """
    if compat_os_name != 'nt':
        if not os.getenv('TERM'):
            return False
    else:
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        if not WINDOWS_VT_MODE:
            return False
        if get_windows_version() < (10, 0, 10586):
            return False

    try:
        return stream.isatty()
    except BaseException:
        # Any failure to query the stream counts as "not supported"
        return False
5076
5077
# Matches ESC, '[', one or more characters other than 'm', and a closing 'm'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with everything matching _terminal_sequences_re removed."""
    return re.sub(_terminal_sequences_re, '', string)
5083
5084
def number_of_digits(number):
    """Return the length of `number` formatted as a decimal integer
    (a minus sign counts as one character)."""
    rendered = '%d' % number
    return len(rendered)
5087
5088
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy `values` with `delim`.

    If `from_dict` is given, `values` are keys that are first looked up in
    it; missing keys are skipped like any other falsy value.
    """
    if from_dict is not None:
        values = [from_dict.get(key) for key in values]
    return delim.join(str(item) for item in values if item)