yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import collections
  11 import contextlib
  12 import ctypes
  13 import datetime
  14 import email.utils
  15 import email.header
  16 import errno
  17 import functools
  18 import gzip
  19 import hashlib
  20 import hmac
  21 import importlib.util
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import operator
  28 import os
  29 import platform
  30 import random
  31 import re
  32 import socket
  33 import ssl
  34 import subprocess
  35 import sys
  36 import tempfile
  37 import time
  38 import traceback
  39 import xml.etree.ElementTree
  40 import zlib
  41 import mimetypes
  42
  43 from .compat import (
  44     compat_HTMLParseError,
  45     compat_HTMLParser,
  46     compat_HTTPError,
  47     compat_basestring,
  48     compat_chr,
  49     compat_cookiejar,
  50     compat_ctypes_WINFUNCTYPE,
  51     compat_etree_fromstring,
  52     compat_expanduser,
  53     compat_html_entities,
  54     compat_html_entities_html5,
  55     compat_http_client,
  56     compat_integer_types,
  57     compat_numeric_types,
  58     compat_kwargs,
  59     compat_os_name,
  60     compat_parse_qs,
  61     compat_shlex_quote,
  62     compat_str,
  63     compat_struct_pack,
  64     compat_struct_unpack,
  65     compat_urllib_error,
  66     compat_urllib_parse,
  67     compat_urllib_parse_urlencode,
  68     compat_urllib_parse_urlparse,
  69     compat_urllib_parse_urlunparse,
  70     compat_urllib_parse_quote,
  71     compat_urllib_parse_quote_plus,
  72     compat_urllib_parse_unquote_plus,
  73     compat_urllib_request,
  74     compat_urlparse,
  75     compat_xpath,
  76 )
  77
  78 from .socks import (
  79     ProxyType,
  80     sockssocket,
  81 )
  82
  83
  84 def register_socks_protocols():
  85     # "Register" SOCKS protocols
  86     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  87     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  88     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  89         if scheme not in compat_urlparse.uses_netloc:
  90             compat_urlparse.uses_netloc.append(scheme)
  91
  92
  93 # This is not clearly defined otherwise
  94 compiled_regex_type = type(re.compile(''))
  95
  96
  97 def random_user_agent():
  98     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  99     _CHROME_VERSIONS = (
 100         '90.0.4430.212',
 101         '90.0.4430.24',
 102         '90.0.4430.70',
 103         '90.0.4430.72',
 104         '90.0.4430.85',
 105         '90.0.4430.93',
 106         '91.0.4472.101',
 107         '91.0.4472.106',
 108         '91.0.4472.114',
 109         '91.0.4472.124',
 110         '91.0.4472.164',
 111         '91.0.4472.19',
 112         '91.0.4472.77',
 113         '92.0.4515.107',
 114         '92.0.4515.115',
 115         '92.0.4515.131',
 116         '92.0.4515.159',
 117         '92.0.4515.43',
 118         '93.0.4556.0',
 119         '93.0.4577.15',
 120         '93.0.4577.63',
 121         '93.0.4577.82',
 122         '94.0.4606.41',
 123         '94.0.4606.54',
 124         '94.0.4606.61',
 125         '94.0.4606.71',
 126         '94.0.4606.81',
 127         '94.0.4606.85',
 128         '95.0.4638.17',
 129         '95.0.4638.50',
 130         '95.0.4638.54',
 131         '95.0.4638.69',
 132         '95.0.4638.74',
 133         '96.0.4664.18',
 134         '96.0.4664.45',
 135         '96.0.4664.55',
 136         '96.0.4664.93',
 137         '97.0.4692.20',
 138     )
 139     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 140
 141
 142 std_headers = {
 143     'User-Agent': random_user_agent(),
 144     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 145     'Accept-Encoding': 'gzip, deflate',
 146     'Accept-Language': 'en-us,en;q=0.5',
 147 }
 148
 149
 150 USER_AGENTS = {
 151     'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
 152 }
 153
 154
 155 NO_DEFAULT = object()
 156
 157 ENGLISH_MONTH_NAMES = [
 158     'January', 'February', 'March', 'April', 'May', 'June',
 159     'July', 'August', 'September', 'October', 'November', 'December']
 160
 161 MONTH_NAMES = {
 162     'en': ENGLISH_MONTH_NAMES,
 163     'fr': [
 164         'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
 165         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
 166 }
 167
 168 KNOWN_EXTENSIONS = (
 169     'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
 170     'flv', 'f4v', 'f4a', 'f4b',
 171     'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
 172     'mkv', 'mka', 'mk3d',
 173     'avi', 'divx',
 174     'mov',
 175     'asf', 'wmv', 'wma',
 176     '3gp', '3g2',
 177     'mp3',
 178     'flac',
 179     'ape',
 180     'wav',
 181     'f4f', 'f4m', 'm3u8', 'smil')
 182
 183 # needed for sanitizing filenames in restricted mode
 184 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
 185                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
 186                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 187
 188 DATE_FORMATS = (
 189     '%d %B %Y',
 190     '%d %b %Y',
 191     '%B %d %Y',
 192     '%B %dst %Y',
 193     '%B %dnd %Y',
 194     '%B %drd %Y',
 195     '%B %dth %Y',
 196     '%b %d %Y',
 197     '%b %dst %Y',
 198     '%b %dnd %Y',
 199     '%b %drd %Y',
 200     '%b %dth %Y',
 201     '%b %dst %Y %I:%M',
 202     '%b %dnd %Y %I:%M',
 203     '%b %drd %Y %I:%M',
 204     '%b %dth %Y %I:%M',
 205     '%Y %m %d',
 206     '%Y-%m-%d',
 207     '%Y.%m.%d.',
 208     '%Y/%m/%d',
 209     '%Y/%m/%d %H:%M',
 210     '%Y/%m/%d %H:%M:%S',
 211     '%Y%m%d%H%M',
 212     '%Y%m%d%H%M%S',
 213     '%Y%m%d',
 214     '%Y-%m-%d %H:%M',
 215     '%Y-%m-%d %H:%M:%S',
 216     '%Y-%m-%d %H:%M:%S.%f',
 217     '%Y-%m-%d %H:%M:%S:%f',
 218     '%d.%m.%Y %H:%M',
 219     '%d.%m.%Y %H.%M',
 220     '%Y-%m-%dT%H:%M:%SZ',
 221     '%Y-%m-%dT%H:%M:%S.%fZ',
 222     '%Y-%m-%dT%H:%M:%S.%f0Z',
 223     '%Y-%m-%dT%H:%M:%S',
 224     '%Y-%m-%dT%H:%M:%S.%f',
 225     '%Y-%m-%dT%H:%M',
 226     '%b %d %Y at %H:%M',
 227     '%b %d %Y at %H:%M:%S',
 228     '%B %d %Y at %H:%M',
 229     '%B %d %Y at %H:%M:%S',
 230     '%H:%M %d-%b-%Y',
 231 )
 232
 233 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 234 DATE_FORMATS_DAY_FIRST.extend([
 235     '%d-%m-%Y',
 236     '%d.%m.%Y',
 237     '%d.%m.%y',
 238     '%d/%m/%Y',
 239     '%d/%m/%y',
 240     '%d/%m/%Y %H:%M:%S',
 241 ])
 242
 243 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 244 DATE_FORMATS_MONTH_FIRST.extend([
 245     '%m-%d-%Y',
 246     '%m.%d.%Y',
 247     '%m/%d/%Y',
 248     '%m/%d/%y',
 249     '%m/%d/%Y %H:%M:%S',
 250 ])
 251
 252 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 253 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 254
 255
 256 def preferredencoding():
 257     """Get preferred encoding.
 258
 259     Returns the best encoding scheme for the system, based on
 260     locale.getpreferredencoding() and some further tweaks.
 261     """
 262     try:
 263         pref = locale.getpreferredencoding()
 264         'TEST'.encode(pref)
 265     except Exception:
 266         pref = 'UTF-8'
 267
 268     return pref
 269
 270
 271 def write_json_file(obj, fn):
 272     """ Encode obj as JSON and write it to fn, atomically if possible """
 273
 274     fn = encodeFilename(fn)
 275     if sys.version_info < (3, 0) and sys.platform != 'win32':
 276         encoding = get_filesystem_encoding()
 277         # os.path.basename returns a bytes object, but NamedTemporaryFile
 278         # will fail if the filename contains non ascii characters unless we
 279         # use a unicode object
 280         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 281         # the same for os.path.dirname
 282         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 283     else:
 284         path_basename = os.path.basename
 285         path_dirname = os.path.dirname
 286
 287     args = {
 288         'suffix': '.tmp',
 289         'prefix': path_basename(fn) + '.',
 290         'dir': path_dirname(fn),
 291         'delete': False,
 292     }
 293
 294     # In Python 2.x, json.dump expects a bytestream.
 295     # In Python 3.x, it writes to a character stream
 296     if sys.version_info < (3, 0):
 297         args['mode'] = 'wb'
 298     else:
 299         args.update({
 300             'mode': 'w',
 301             'encoding': 'utf-8',
 302         })
 303
 304     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 305
 306     try:
 307         with tf:
 308             json.dump(obj, tf, ensure_ascii=False)
 309         if sys.platform == 'win32':
 310             # Need to remove existing file on Windows, else os.rename raises
 311             # WindowsError or FileExistsError.
 312             try:
 313                 os.unlink(fn)
 314             except OSError:
 315                 pass
 316         try:
 317             mask = os.umask(0)
 318             os.umask(mask)
 319             os.chmod(tf.name, 0o666 & ~mask)
 320         except OSError:
 321             pass
 322         os.rename(tf.name, fn)
 323     except Exception:
 324         try:
 325             os.remove(tf.name)
 326         except OSError:
 327             pass
 328         raise
 329
 330
 331 if sys.version_info >= (2, 7):
 332     def find_xpath_attr(node, xpath, key, val=None):
 333         """ Find the xpath xpath[@key=val] """
 334         assert re.match(r'^[a-zA-Z_-]+$', key)
 335         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 336         return node.find(expr)
 337 else:
 338     def find_xpath_attr(node, xpath, key, val=None):
 339         for f in node.findall(compat_xpath(xpath)):
 340             if key not in f.attrib:
 341                 continue
 342             if val is None or f.attrib.get(key) == val:
 343                 return f
 344         return None
 345
 346 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 347 # the namespace parameter
 348
 349
 350 def xpath_with_ns(path, ns_map):
 351     components = [c.split(':') for c in path.split('/')]
 352     replaced = []
 353     for c in components:
 354         if len(c) == 1:
 355             replaced.append(c[0])
 356         else:
 357             ns, tag = c
 358             replaced.append('{%s}%s' % (ns_map[ns], tag))
 359     return '/'.join(replaced)
 360
 361
 362 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 363     def _find_xpath(xpath):
 364         return node.find(compat_xpath(xpath))
 365
 366     if isinstance(xpath, (str, compat_str)):
 367         n = _find_xpath(xpath)
 368     else:
 369         for xp in xpath:
 370             n = _find_xpath(xp)
 371             if n is not None:
 372                 break
 373
 374     if n is None:
 375         if default is not NO_DEFAULT:
 376             return default
 377         elif fatal:
 378             name = xpath if name is None else name
 379             raise ExtractorError('Could not find XML element %s' % name)
 380         else:
 381             return None
 382     return n
 383
 384
 385 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 386     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 387     if n is None or n == default:
 388         return n
 389     if n.text is None:
 390         if default is not NO_DEFAULT:
 391             return default
 392         elif fatal:
 393             name = xpath if name is None else name
 394             raise ExtractorError('Could not find XML element\'s text %s' % name)
 395         else:
 396             return None
 397     return n.text
 398
 399
 400 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 401     n = find_xpath_attr(node, xpath, key)
 402     if n is None:
 403         if default is not NO_DEFAULT:
 404             return default
 405         elif fatal:
 406             name = '%s[@%s]' % (xpath, key) if name is None else name
 407             raise ExtractorError('Could not find XML attribute %s' % name)
 408         else:
 409             return None
 410     return n.attrib[key]
 411
 412
 413 def get_element_by_id(id, html):
 414     """Return the content of the tag with the specified ID in the passed HTML document"""
 415     return get_element_by_attribute('id', id, html)
 416
 417
 418 def get_element_by_class(class_name, html):
 419     """Return the content of the first tag with the specified class in the passed HTML document"""
 420     retval = get_elements_by_class(class_name, html)
 421     return retval[0] if retval else None
 422
 423
 424 def get_element_by_attribute(attribute, value, html, escape_value=True):
 425     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 426     return retval[0] if retval else None
 427
 428
 429 def get_elements_by_class(class_name, html):
 430     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 431     return get_elements_by_attribute(
 432         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 433         html, escape_value=False)
 434
 435
 436 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 437     """Return the content of the tag with the specified attribute in the passed HTML document"""
 438
 439     value = re.escape(value) if escape_value else value
 440
 441     retlist = []
 442     for m in re.finditer(r'''(?xs)
 443         <([a-zA-Z0-9:._-]+)
 444          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 445          \s+%s=['"]?%s['"]?
 446          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 447         \s*>
 448         (?P<content>.*?)
 449         </\1>
 450     ''' % (re.escape(attribute), value), html):
 451         res = m.group('content')
 452
 453         if res.startswith('"') or res.startswith("'"):
 454             res = res[1:-1]
 455
 456         retlist.append(unescapeHTML(res))
 457
 458     return retlist
 459
 460
 461 class HTMLAttributeParser(compat_HTMLParser):
 462     """Trivial HTML parser to gather the attributes for a single element"""
 463
 464     def __init__(self):
 465         self.attrs = {}
 466         compat_HTMLParser.__init__(self)
 467
 468     def handle_starttag(self, tag, attrs):
 469         self.attrs = dict(attrs)
 470
 471
 472 class HTMLListAttrsParser(compat_HTMLParser):
 473     """HTML parser to gather the attributes for the elements of a list"""
 474
 475     def __init__(self):
 476         compat_HTMLParser.__init__(self)
 477         self.items = []
 478         self._level = 0
 479
 480     def handle_starttag(self, tag, attrs):
 481         if tag == 'li' and self._level == 0:
 482             self.items.append(dict(attrs))
 483         self._level += 1
 484
 485     def handle_endtag(self, tag):
 486         self._level -= 1
 487
 488
 489 def extract_attributes(html_element):
 490     """Given a string for an HTML element such as
 491     <el
 492          a="foo" B="bar" c="&98;az" d=boz
 493          empty= noval entity="&amp;"
 494          sq='"' dq="'"
 495     >
 496     Decode and return a dictionary of attributes.
 497     {
 498         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 499         'empty': '', 'noval': None, 'entity': '&',
 500         'sq': '"', 'dq': '\''
 501     }.
 502     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 503     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 504     """
 505     parser = HTMLAttributeParser()
 506     try:
 507         parser.feed(html_element)
 508         parser.close()
 509     # Older Python may throw HTMLParseError in case of malformed HTML
 510     except compat_HTMLParseError:
 511         pass
 512     return parser.attrs
 513
 514
 515 def parse_list(webpage):
 516     """Given a string for an series of HTML <li> elements,
 517     return a dictionary of their attributes"""
 518     parser = HTMLListAttrsParser()
 519     parser.feed(webpage)
 520     parser.close()
 521     return parser.items
 522
 523
 524 def clean_html(html):
 525     """Clean an HTML snippet into a readable string"""
 526
 527     if html is None:  # Convenience for sanitizing descriptions etc.
 528         return html
 529
 530     # Newline vs <br />
 531     html = html.replace('\n', ' ')
 532     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 533     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 534     # Strip html tags
 535     html = re.sub('<.*?>', '', html)
 536     # Replace html entities
 537     html = unescapeHTML(html)
 538     return html.strip()
 539
 540
 541 def sanitize_open(filename, open_mode):
 542     """Try to open the given filename, and slightly tweak it if this fails.
 543
 544     Attempts to open the given filename. If this fails, it tries to change
 545     the filename slightly, step by step, until it's either able to open it
 546     or it fails and raises a final exception, like the standard open()
 547     function.
 548
 549     It returns the tuple (stream, definitive_file_name).
 550     """
 551     try:
 552         if filename == '-':
 553             if sys.platform == 'win32':
 554                 import msvcrt
 555                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 556             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 557         stream = open(encodeFilename(filename), open_mode)
 558         return (stream, filename)
 559     except (IOError, OSError) as err:
 560         if err.errno in (errno.EACCES,):
 561             raise
 562
 563         # In case of error, try to remove win32 forbidden chars
 564         alt_filename = sanitize_path(filename)
 565         if alt_filename == filename:
 566             raise
 567         else:
 568             # An exception here should be caught in the caller
 569             stream = open(encodeFilename(alt_filename), open_mode)
 570             return (stream, alt_filename)
 571
 572
 573 def timeconvert(timestr):
 574     """Convert RFC 2822 defined time string into system timestamp"""
 575     timestamp = None
 576     timetuple = email.utils.parsedate_tz(timestr)
 577     if timetuple is not None:
 578         timestamp = email.utils.mktime_tz(timetuple)
 579     return timestamp
 580
 581
 582 def sanitize_filename(s, restricted=False, is_id=False):
 583     """Sanitizes a string so it could be used as part of a filename.
 584     If restricted is set, use a stricter subset of allowed characters.
 585     Set is_id if this is not an arbitrary string, but an ID that should be kept
 586     if possible.
 587     """
 588     def replace_insane(char):
 589         if restricted and char in ACCENT_CHARS:
 590             return ACCENT_CHARS[char]
 591         elif not restricted and char == '\n':
 592             return ' '
 593         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 594             return ''
 595         elif char == '"':
 596             return '' if restricted else '\''
 597         elif char == ':':
 598             return '_-' if restricted else ' -'
 599         elif char in '\\/|*<>':
 600             return '_'
 601         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 602             return '_'
 603         if restricted and ord(char) > 127:
 604             return '_'
 605         return char
 606
 607     if s == '':
 608         return ''
 609     # Handle timestamps
 610     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 611     result = ''.join(map(replace_insane, s))
 612     if not is_id:
 613         while '__' in result:
 614             result = result.replace('__', '_')
 615         result = result.strip('_')
 616         # Common case of "Foreign band name - English song title"
 617         if restricted and result.startswith('-_'):
 618             result = result[2:]
 619         if result.startswith('-'):
 620             result = '_' + result[len('-'):]
 621         result = result.lstrip('.')
 622         if not result:
 623             result = '_'
 624     return result
 625
 626
 627 def sanitize_path(s, force=False):
 628     """Sanitizes and normalizes path on Windows"""
 629     if sys.platform == 'win32':
 630         force = False
 631         drive_or_unc, _ = os.path.splitdrive(s)
 632         if sys.version_info < (2, 7) and not drive_or_unc:
 633             drive_or_unc, _ = os.path.splitunc(s)
 634     elif force:
 635         drive_or_unc = ''
 636     else:
 637         return s
 638
 639     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 640     if drive_or_unc:
 641         norm_path.pop(0)
 642     sanitized_path = [
 643         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 644         for path_part in norm_path]
 645     if drive_or_unc:
 646         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 647     elif force and s[0] == os.path.sep:
 648         sanitized_path.insert(0, os.path.sep)
 649     return os.path.join(*sanitized_path)
 650
 651
 652 def sanitize_url(url):
 653     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 654     # the number of unwanted failures due to missing protocol
 655     if url.startswith('//'):
 656         return 'http:%s' % url
 657     # Fix some common typos seen so far
 658     COMMON_TYPOS = (
 659         # https://github.com/ytdl-org/youtube-dl/issues/15649
 660         (r'^httpss://', r'https://'),
 661         # https://bx1.be/lives/direct-tv/
 662         (r'^rmtp([es]?)://', r'rtmp\1://'),
 663     )
 664     for mistake, fixup in COMMON_TYPOS:
 665         if re.match(mistake, url):
 666             return re.sub(mistake, fixup, url)
 667     return url
 668
 669
 670 def extract_basic_auth(url):
 671     parts = compat_urlparse.urlsplit(url)
 672     if parts.username is None:
 673         return url, None
 674     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 675         parts.hostname if parts.port is None
 676         else '%s:%d' % (parts.hostname, parts.port))))
 677     auth_payload = base64.b64encode(
 678         ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
 679     return url, 'Basic ' + auth_payload.decode('utf-8')
 680
 681
 682 def sanitized_Request(url, *args, **kwargs):
 683     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 684     if auth_header is not None:
 685         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 686         headers['Authorization'] = auth_header
 687     return compat_urllib_request.Request(url, *args, **kwargs)
 688
 689
 690 def expand_path(s):
 691     """Expand shell variables and ~"""
 692     return os.path.expandvars(compat_expanduser(s))
 693
 694
 695 def orderedSet(iterable):
 696     """ Remove all duplicates from the input iterable """
 697     res = []
 698     for el in iterable:
 699         if el not in res:
 700             res.append(el)
 701     return res
 702
 703
 704 def _htmlentity_transform(entity_with_semicolon):
 705     """Transforms an HTML entity to a character."""
 706     entity = entity_with_semicolon[:-1]
 707
 708     # Known non-numeric HTML entity
 709     if entity in compat_html_entities.name2codepoint:
 710         return compat_chr(compat_html_entities.name2codepoint[entity])
 711
 712     # TODO: HTML5 allows entities without a semicolon. For example,
 713     # '&Eacuteric' should be decoded as 'Éric'.
 714     if entity_with_semicolon in compat_html_entities_html5:
 715         return compat_html_entities_html5[entity_with_semicolon]
 716
 717     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 718     if mobj is not None:
 719         numstr = mobj.group(1)
 720         if numstr.startswith('x'):
 721             base = 16
 722             numstr = '0%s' % numstr
 723         else:
 724             base = 10
 725         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 726         try:
 727             return compat_chr(int(numstr, base))
 728         except ValueError:
 729             pass
 730
 731     # Unknown entity in name, return its literal representation
 732     return '&%s;' % entity
 733
 734
 735 def unescapeHTML(s):
 736     if s is None:
 737         return None
 738     assert type(s) == compat_str
 739
 740     return re.sub(
 741         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 742
 743
 744 def escapeHTML(text):
 745     return (
 746         text
 747         .replace('&', '&amp;')
 748         .replace('<', '&lt;')
 749         .replace('>', '&gt;')
 750         .replace('"', '&quot;')
 751         .replace("'", '&#39;')
 752     )
 753
 754
 755 def process_communicate_or_kill(p, *args, **kwargs):
 756     try:
 757         return p.communicate(*args, **kwargs)
 758     except BaseException:  # Including KeyboardInterrupt
 759         p.kill()
 760         p.wait()
 761         raise
 762
 763
 764 class Popen(subprocess.Popen):
 765     if sys.platform == 'win32':
 766         _startupinfo = subprocess.STARTUPINFO()
 767         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 768     else:
 769         _startupinfo = None
 770
 771     def __init__(self, *args, **kwargs):
 772         super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
 773
 774     def communicate_or_kill(self, *args, **kwargs):
 775         return process_communicate_or_kill(self, *args, **kwargs)
 776
 777
 778 def get_subprocess_encoding():
 779     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 780         # For subprocess calls, encode with locale encoding
 781         # Refer to http://stackoverflow.com/a/9951851/35070
 782         encoding = preferredencoding()
 783     else:
 784         encoding = sys.getfilesystemencoding()
 785     if encoding is None:
 786         encoding = 'utf-8'
 787     return encoding
 788
 789
 790 def encodeFilename(s, for_subprocess=False):
 791     """
 792     @param s The name of the file
 793     """
 794
 795     assert type(s) == compat_str
 796
 797     # Python 3 has a Unicode API
 798     if sys.version_info >= (3, 0):
 799         return s
 800
 801     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 802     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 803     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 804     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 805         return s
 806
 807     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 808     if sys.platform.startswith('java'):
 809         return s
 810
 811     return s.encode(get_subprocess_encoding(), 'ignore')
 812
 813
 814 def decodeFilename(b, for_subprocess=False):
 815
 816     if sys.version_info >= (3, 0):
 817         return b
 818
 819     if not isinstance(b, bytes):
 820         return b
 821
 822     return b.decode(get_subprocess_encoding(), 'ignore')
 823
 824
 825 def encodeArgument(s):
 826     if not isinstance(s, compat_str):
 827         # Legacy code that uses byte strings
 828         # Uncomment the following line after fixing all post processors
 829         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 830         s = s.decode('ascii')
 831     return encodeFilename(s, True)
 832
 833
 834 def decodeArgument(b):
 835     return decodeFilename(b, True)
 836
 837
 838 def decodeOption(optval):
 839     if optval is None:
 840         return optval
 841     if isinstance(optval, bytes):
 842         optval = optval.decode(preferredencoding())
 843
 844     assert isinstance(optval, compat_str)
 845     return optval
 846
 847
 848 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 849
 850
 851 def timetuple_from_msec(msec):
 852     secs, msec = divmod(msec, 1000)
 853     mins, secs = divmod(secs, 60)
 854     hrs, mins = divmod(mins, 60)
 855     return _timetuple(hrs, mins, secs, msec)
 856
 857
 858 def formatSeconds(secs, delim=':', msec=False):
 859     time = timetuple_from_msec(secs * 1000)
 860     if time.hours:
 861         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 862     elif time.minutes:
 863         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 864     else:
 865         ret = '%d' % time.seconds
 866     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 867
 868
 869 def _ssl_load_windows_store_certs(ssl_context, storename):
 870     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 871     try:
 872         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 873                  if encoding == 'x509_asn' and (
 874                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 875     except PermissionError:
 876         return
 877     for cert in certs:
 878         try:
 879             ssl_context.load_verify_locations(cadata=cert)
 880         except ssl.SSLError:
 881             pass
 882
 883
 884 def make_HTTPS_handler(params, **kwargs):
 885     opts_check_certificate = not params.get('nocheckcertificate')
 886     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 887     context.check_hostname = opts_check_certificate
 888     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
 889     if opts_check_certificate:
 890         try:
 891             context.load_default_certs()
 892             # Work around the issue in load_default_certs when there are bad certificates. See:
 893             # https://github.com/yt-dlp/yt-dlp/issues/1060,
 894             # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
 895         except ssl.SSLError:
 896             # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
 897             if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
 898                 # Create a new context to discard any certificates that were already loaded
 899                 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 900                 context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
 901                 for storename in ('CA', 'ROOT'):
 902                     _ssl_load_windows_store_certs(context, storename)
 903             context.set_default_verify_paths()
 904     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 905
 906
 907 def bug_reports_message(before=';'):
 908     if ytdl_is_updateable():
 909         update_cmd = 'type  yt-dlp -U  to update'
 910     else:
 911         update_cmd = 'see  https://github.com/yt-dlp/yt-dlp  on how to update'
 912     msg = 'please report this issue on  https://github.com/yt-dlp/yt-dlp .'
 913     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 914     msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
 915
 916     before = before.rstrip()
 917     if not before or before.endswith(('.', '!', '?')):
 918         msg = msg[0].title() + msg[1:]
 919
 920     return (before + ' ' if before else '') + msg
 921
 922
 923 class YoutubeDLError(Exception):
 924     """Base exception for YoutubeDL errors."""
 925     msg = None
 926
 927     def __init__(self, msg=None):
 928         if msg is not None:
 929             self.msg = msg
 930         elif self.msg is None:
 931             self.msg = type(self).__name__
 932         super().__init__(self.msg)
 933
 934
 935 network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
 936 if hasattr(ssl, 'CertificateError'):
 937     network_exceptions.append(ssl.CertificateError)
 938 network_exceptions = tuple(network_exceptions)
 939
 940
 941 class ExtractorError(YoutubeDLError):
 942     """Error during info extraction."""
 943
 944     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
 945         """ tb, if given, is the original traceback (so that it can be printed out).
 946         If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
 947         """
 948         if sys.exc_info()[0] in network_exceptions:
 949             expected = True
 950
 951         self.msg = str(msg)
 952         self.traceback = tb
 953         self.expected = expected
 954         self.cause = cause
 955         self.video_id = video_id
 956         self.ie = ie
 957         self.exc_info = sys.exc_info()  # preserve original exception
 958
 959         super(ExtractorError, self).__init__(''.join((
 960             format_field(ie, template='[%s] '),
 961             format_field(video_id, template='%s: '),
 962             self.msg,
 963             format_field(cause, template=' (caused by %r)'),
 964             '' if expected else bug_reports_message())))
 965
 966     def format_traceback(self):
 967         if self.traceback is None:
 968             return None
 969         return ''.join(traceback.format_tb(self.traceback))
 970
 971
 972 class UnsupportedError(ExtractorError):
 973     def __init__(self, url):
 974         super(UnsupportedError, self).__init__(
 975             'Unsupported URL: %s' % url, expected=True)
 976         self.url = url
 977
 978
 979 class RegexNotFoundError(ExtractorError):
 980     """Error when a regex didn't match"""
 981     pass
 982
 983
 984 class GeoRestrictedError(ExtractorError):
 985     """Geographic restriction Error exception.
 986
 987     This exception may be thrown when a video is not available from your
 988     geographic location due to geographic restrictions imposed by a website.
 989     """
 990
 991     def __init__(self, msg, countries=None, **kwargs):
 992         kwargs['expected'] = True
 993         super(GeoRestrictedError, self).__init__(msg, **kwargs)
 994         self.countries = countries
 995
 996
 997 class DownloadError(YoutubeDLError):
 998     """Download Error exception.
 999
1000     This exception may be thrown by FileDownloader objects if they are not
1001     configured to continue on errors. They will contain the appropriate
1002     error message.
1003     """
1004
1005     def __init__(self, msg, exc_info=None):
1006         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1007         super(DownloadError, self).__init__(msg)
1008         self.exc_info = exc_info
1009
1010
1011 class EntryNotInPlaylist(YoutubeDLError):
1012     """Entry not in playlist exception.
1013
1014     This exception will be thrown by YoutubeDL when a requested entry
1015     is not found in the playlist info_dict
1016     """
1017     msg = 'Entry not found in info'
1018
1019
1020 class SameFileError(YoutubeDLError):
1021     """Same File exception.
1022
1023     This exception will be thrown by FileDownloader objects if they detect
1024     multiple files would have to be downloaded to the same file on disk.
1025     """
1026     msg = 'Fixed output name but more than one file to download'
1027
1028     def __init__(self, filename=None):
1029         if filename is not None:
1030             self.msg += f': {filename}'
1031         super().__init__(self.msg)
1032
1033
1034 class PostProcessingError(YoutubeDLError):
1035     """Post Processing exception.
1036
1037     This exception may be raised by PostProcessor's .run() method to
1038     indicate an error in the postprocessing task.
1039     """
1040
1041
1042 class DownloadCancelled(YoutubeDLError):
1043     """ Exception raised when the download queue should be interrupted """
1044     msg = 'The download was cancelled'
1045
1046
1047 class ExistingVideoReached(DownloadCancelled):
1048     """ --break-on-existing triggered """
1049     msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1050
1051
1052 class RejectedVideoReached(DownloadCancelled):
1053     """ --break-on-reject triggered """
1054     msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1055
1056
1057 class MaxDownloadsReached(DownloadCancelled):
1058     """ --max-downloads limit has been reached. """
1059     msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1060
1061
1062 class ReExtractInfo(YoutubeDLError):
1063     """ Video info needs to be re-extracted. """
1064
1065     def __init__(self, msg, expected=False):
1066         super().__init__(msg)
1067         self.expected = expected
1068
1069
1070 class ThrottledDownload(ReExtractInfo):
1071     """ Download speed below --throttled-rate. """
1072     msg = 'The download speed is below throttle limit'
1073
1074     def __init__(self):
1075         super().__init__(self.msg, expected=False)
1076
1077
1078 class UnavailableVideoError(YoutubeDLError):
1079     """Unavailable Format exception.
1080
1081     This exception will be thrown when a video is requested
1082     in a format that is not available for that video.
1083     """
1084     msg = 'Unable to download video'
1085
1086     def __init__(self, err=None):
1087         if err is not None:
1088             self.msg += f': {err}'
1089         super().__init__(self.msg)
1090
1091
1092 class ContentTooShortError(YoutubeDLError):
1093     """Content Too Short exception.
1094
1095     This exception may be raised by FileDownloader objects when a file they
1096     download is too small for what the server announced first, indicating
1097     the connection was probably interrupted.
1098     """
1099
1100     def __init__(self, downloaded, expected):
1101         super(ContentTooShortError, self).__init__(
1102             'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
1103         )
1104         # Both in bytes
1105         self.downloaded = downloaded
1106         self.expected = expected
1107
1108
1109 class XAttrMetadataError(YoutubeDLError):
1110     def __init__(self, code=None, msg='Unknown error'):
1111         super(XAttrMetadataError, self).__init__(msg)
1112         self.code = code
1113         self.msg = msg
1114
1115         # Parsing code and msg
1116         if (self.code in (errno.ENOSPC, errno.EDQUOT)
1117                 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1118             self.reason = 'NO_SPACE'
1119         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1120             self.reason = 'VALUE_TOO_LONG'
1121         else:
1122             self.reason = 'NOT_SUPPORTED'
1123
1124
1125 class XAttrUnavailableError(YoutubeDLError):
1126     pass
1127
1128
1129 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1130     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
1131     # expected HTTP responses to meet HTTP/1.0 or later (see also
1132     # https://github.com/ytdl-org/youtube-dl/issues/6727)
1133     if sys.version_info < (3, 0):
1134         kwargs['strict'] = True
1135     hc = http_class(*args, **compat_kwargs(kwargs))
1136     source_address = ydl_handler._params.get('source_address')
1137
1138     if source_address is not None:
1139         # This is to workaround _create_connection() from socket where it will try all
1140         # address data from getaddrinfo() including IPv6. This filters the result from
1141         # getaddrinfo() based on the source_address value.
1142         # This is based on the cpython socket.create_connection() function.
1143         # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1144         def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1145             host, port = address
1146             err = None
1147             addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1148             af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1149             ip_addrs = [addr for addr in addrs if addr[0] == af]
1150             if addrs and not ip_addrs:
1151                 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1152                 raise socket.error(
1153                     "No remote IP%s addresses available for connect, can't use '%s' as source address"
1154                     % (ip_version, source_address[0]))
1155             for res in ip_addrs:
1156                 af, socktype, proto, canonname, sa = res
1157                 sock = None
1158                 try:
1159                     sock = socket.socket(af, socktype, proto)
1160                     if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1161                         sock.settimeout(timeout)
1162                     sock.bind(source_address)
1163                     sock.connect(sa)
1164                     err = None  # Explicitly break reference cycle
1165                     return sock
1166                 except socket.error as _:
1167                     err = _
1168                     if sock is not None:
1169                         sock.close()
1170             if err is not None:
1171                 raise err
1172             else:
1173                 raise socket.error('getaddrinfo returns an empty list')
1174         if hasattr(hc, '_create_connection'):
1175             hc._create_connection = _create_connection
1176         sa = (source_address, 0)
1177         if hasattr(hc, 'source_address'):  # Python 2.7+
1178             hc.source_address = sa
1179         else:  # Python 2.6
1180             def _hc_connect(self, *args, **kwargs):
1181                 sock = _create_connection(
1182                     (self.host, self.port), self.timeout, sa)
1183                 if is_https:
1184                     self.sock = ssl.wrap_socket(
1185                         sock, self.key_file, self.cert_file,
1186                         ssl_version=ssl.PROTOCOL_TLSv1)
1187                 else:
1188                     self.sock = sock
1189             hc.connect = functools.partial(_hc_connect, hc)
1190
1191     return hc
1192
1193
1194 def handle_youtubedl_headers(headers):
1195     filtered_headers = headers
1196
1197     if 'Youtubedl-no-compression' in filtered_headers:
1198         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
1199         del filtered_headers['Youtubedl-no-compression']
1200
1201     return filtered_headers
1202
1203
1204 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
1205     """Handler for HTTP requests and responses.
1206
1207     This class, when installed with an OpenerDirector, automatically adds
1208     the standard headers to every HTTP request and handles gzipped and
1209     deflated responses from web servers. If compression is to be avoided in
1210     a particular request, the original request in the program code only has
1211     to include the HTTP header "Youtubedl-no-compression", which will be
1212     removed before making the real request.
1213
1214     Part of this code was copied from:
1215
1216     http://techknack.net/python-urllib2-handlers/
1217
1218     Andrew Rowls, the author of that code, agreed to release it to the
1219     public domain.
1220     """
1221
1222     def __init__(self, params, *args, **kwargs):
1223         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
1224         self._params = params
1225
1226     def http_open(self, req):
1227         conn_class = compat_http_client.HTTPConnection
1228
1229         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1230         if socks_proxy:
1231             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1232             del req.headers['Ytdl-socks-proxy']
1233
1234         return self.do_open(functools.partial(
1235             _create_http_connection, self, conn_class, False),
1236             req)
1237
1238     @staticmethod
1239     def deflate(data):
1240         if not data:
1241             return data
1242         try:
1243             return zlib.decompress(data, -zlib.MAX_WBITS)
1244         except zlib.error:
1245             return zlib.decompress(data)
1246
1247     def http_request(self, req):
1248         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1249         # always respected by websites, some tend to give out URLs with non percent-encoded
1250         # non-ASCII characters (see telemb.py, ard.py [#3412])
1251         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1252         # To work around aforementioned issue we will replace request's original URL with
1253         # percent-encoded one
1254         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1255         # the code of this workaround has been moved here from YoutubeDL.urlopen()
1256         url = req.get_full_url()
1257         url_escaped = escape_url(url)
1258
1259         # Substitute URL if any change after escaping
1260         if url != url_escaped:
1261             req = update_Request(req, url=url_escaped)
1262
1263         for h, v in std_headers.items():
1264             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1265             # The dict keys are capitalized because of this bug by urllib
1266             if h.capitalize() not in req.headers:
1267                 req.add_header(h, v)
1268
1269         req.headers = handle_youtubedl_headers(req.headers)
1270
1271         if sys.version_info < (2, 7) and '#' in req.get_full_url():
1272             # Python 2.6 is brain-dead when it comes to fragments
1273             req._Request__original = req._Request__original.partition('#')[0]
1274             req._Request__r_type = req._Request__r_type.partition('#')[0]
1275
1276         return req
1277
1278     def http_response(self, req, resp):
1279         old_resp = resp
1280         # gzip
1281         if resp.headers.get('Content-encoding', '') == 'gzip':
1282             content = resp.read()
1283             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1284             try:
1285                 uncompressed = io.BytesIO(gz.read())
1286             except IOError as original_ioerror:
1287                 # There may be junk add the end of the file
1288                 # See http://stackoverflow.com/q/4928560/35070 for details
1289                 for i in range(1, 1024):
1290                     try:
1291                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1292                         uncompressed = io.BytesIO(gz.read())
1293                     except IOError:
1294                         continue
1295                     break
1296                 else:
1297                     raise original_ioerror
1298             resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1299             resp.msg = old_resp.msg
1300             del resp.headers['Content-encoding']
1301         # deflate
1302         if resp.headers.get('Content-encoding', '') == 'deflate':
1303             gz = io.BytesIO(self.deflate(resp.read()))
1304             resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1305             resp.msg = old_resp.msg
1306             del resp.headers['Content-encoding']
1307         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1308         # https://github.com/ytdl-org/youtube-dl/issues/6457).
1309         if 300 <= resp.code < 400:
1310             location = resp.headers.get('Location')
1311             if location:
1312                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1313                 if sys.version_info >= (3, 0):
1314                     location = location.encode('iso-8859-1').decode('utf-8')
1315                 else:
1316                     location = location.decode('utf-8')
1317                 location_escaped = escape_url(location)
1318                 if location != location_escaped:
1319                     del resp.headers['Location']
1320                     if sys.version_info < (3, 0):
1321                         location_escaped = location_escaped.encode('utf-8')
1322                     resp.headers['Location'] = location_escaped
1323         return resp
1324
1325     https_request = http_request
1326     https_response = http_response
1327
1328
1329 def make_socks_conn_class(base_class, socks_proxy):
1330     assert issubclass(base_class, (
1331         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1332
1333     url_components = compat_urlparse.urlparse(socks_proxy)
1334     if url_components.scheme.lower() == 'socks5':
1335         socks_type = ProxyType.SOCKS5
1336     elif url_components.scheme.lower() in ('socks', 'socks4'):
1337         socks_type = ProxyType.SOCKS4
1338     elif url_components.scheme.lower() == 'socks4a':
1339         socks_type = ProxyType.SOCKS4A
1340
1341     def unquote_if_non_empty(s):
1342         if not s:
1343             return s
1344         return compat_urllib_parse_unquote_plus(s)
1345
1346     proxy_args = (
1347         socks_type,
1348         url_components.hostname, url_components.port or 1080,
1349         True,  # Remote DNS
1350         unquote_if_non_empty(url_components.username),
1351         unquote_if_non_empty(url_components.password),
1352     )
1353
1354     class SocksConnection(base_class):
1355         def connect(self):
1356             self.sock = sockssocket()
1357             self.sock.setproxy(*proxy_args)
1358             if type(self.timeout) in (int, float):
1359                 self.sock.settimeout(self.timeout)
1360             self.sock.connect((self.host, self.port))
1361
1362             if isinstance(self, compat_http_client.HTTPSConnection):
1363                 if hasattr(self, '_context'):  # Python > 2.6
1364                     self.sock = self._context.wrap_socket(
1365                         self.sock, server_hostname=self.host)
1366                 else:
1367                     self.sock = ssl.wrap_socket(self.sock)
1368
1369     return SocksConnection
1370
1371
1372 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1373     def __init__(self, params, https_conn_class=None, *args, **kwargs):
1374         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1375         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1376         self._params = params
1377
1378     def https_open(self, req):
1379         kwargs = {}
1380         conn_class = self._https_conn_class
1381
1382         if hasattr(self, '_context'):  # python > 2.6
1383             kwargs['context'] = self._context
1384         if hasattr(self, '_check_hostname'):  # python 3.x
1385             kwargs['check_hostname'] = self._check_hostname
1386
1387         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1388         if socks_proxy:
1389             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1390             del req.headers['Ytdl-socks-proxy']
1391
1392         return self.do_open(functools.partial(
1393             _create_http_connection, self, conn_class, True),
1394             req, **kwargs)
1395
1396
1397 class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
1398     """
1399     See [1] for cookie file format.
1400
1401     1. https://curl.haxx.se/docs/http-cookies.html
1402     """
1403     _HTTPONLY_PREFIX = '#HttpOnly_'
1404     _ENTRY_LEN = 7
1405     _HEADER = '''# Netscape HTTP Cookie File
1406 # This file is generated by yt-dlp.  Do not edit.
1407
1408 '''
1409     _CookieFileEntry = collections.namedtuple(
1410         'CookieFileEntry',
1411         ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1412
1413     def save(self, filename=None, ignore_discard=False, ignore_expires=False):
1414         """
1415         Save cookies to a file.
1416
1417         Most of the code is taken from CPython 3.8 and slightly adapted
1418         to support cookie files with UTF-8 in both python 2 and 3.
1419         """
1420         if filename is None:
1421             if self.filename is not None:
1422                 filename = self.filename
1423             else:
1424                 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1425
1426         # Store session cookies with `expires` set to 0 instead of an empty
1427         # string
1428         for cookie in self:
1429             if cookie.expires is None:
1430                 cookie.expires = 0
1431
1432         with io.open(filename, 'w', encoding='utf-8') as f:
1433             f.write(self._HEADER)
1434             now = time.time()
1435             for cookie in self:
1436                 if not ignore_discard and cookie.discard:
1437                     continue
1438                 if not ignore_expires and cookie.is_expired(now):
1439                     continue
1440                 if cookie.secure:
1441                     secure = 'TRUE'
1442                 else:
1443                     secure = 'FALSE'
1444                 if cookie.domain.startswith('.'):
1445                     initial_dot = 'TRUE'
1446                 else:
1447                     initial_dot = 'FALSE'
1448                 if cookie.expires is not None:
1449                     expires = compat_str(cookie.expires)
1450                 else:
1451                     expires = ''
1452                 if cookie.value is None:
1453                     # cookies.txt regards 'Set-Cookie: foo' as a cookie
1454                     # with no name, whereas http.cookiejar regards it as a
1455                     # cookie with no value.
1456                     name = ''
1457                     value = cookie.name
1458                 else:
1459                     name = cookie.name
1460                     value = cookie.value
1461                 f.write(
1462                     '\t'.join([cookie.domain, initial_dot, cookie.path,
1463                                secure, expires, name, value]) + '\n')
1464
1465     def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1466         """Load cookies from a file."""
1467         if filename is None:
1468             if self.filename is not None:
1469                 filename = self.filename
1470             else:
1471                 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1472
1473         def prepare_line(line):
1474             if line.startswith(self._HTTPONLY_PREFIX):
1475                 line = line[len(self._HTTPONLY_PREFIX):]
1476             # comments and empty lines are fine
1477             if line.startswith('#') or not line.strip():
1478                 return line
1479             cookie_list = line.split('\t')
1480             if len(cookie_list) != self._ENTRY_LEN:
1481                 raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
1482             cookie = self._CookieFileEntry(*cookie_list)
1483             if cookie.expires_at and not cookie.expires_at.isdigit():
1484                 raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1485             return line
1486
1487         cf = io.StringIO()
1488         with io.open(filename, encoding='utf-8') as f:
1489             for line in f:
1490                 try:
1491                     cf.write(prepare_line(line))
1492                 except compat_cookiejar.LoadError as e:
1493                     write_string(
1494                         'WARNING: skipping cookie file entry due to %s: %r\n'
1495                         % (e, line), sys.stderr)
1496                     continue
1497         cf.seek(0)
1498         self._really_load(cf, filename, ignore_discard, ignore_expires)
1499         # Session cookies are denoted by either `expires` field set to
1500         # an empty string or 0. MozillaCookieJar only recognizes the former
1501         # (see [1]). So we need force the latter to be recognized as session
1502         # cookies on our own.
1503         # Session cookies may be important for cookies-based authentication,
1504         # e.g. usually, when user does not check 'Remember me' check box while
1505         # logging in on a site, some important cookies are stored as session
1506         # cookies so that not recognizing them will result in failed login.
1507         # 1. https://bugs.python.org/issue17164
1508         for cookie in self:
1509             # Treat `expires=0` cookies as session cookies
1510             if cookie.expires == 0:
1511                 cookie.expires = None
1512                 cookie.discard = True
1513
1514
1515 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1516     def __init__(self, cookiejar=None):
1517         compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1518
1519     def http_response(self, request, response):
1520         # Python 2 will choke on next HTTP request in row if there are non-ASCII
1521         # characters in Set-Cookie HTTP header of last response (see
1522         # https://github.com/ytdl-org/youtube-dl/issues/6769).
1523         # In order to at least prevent crashing we will percent encode Set-Cookie
1524         # header before HTTPCookieProcessor starts processing it.
1525         # if sys.version_info < (3, 0) and response.headers:
1526         #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1527         #         set_cookie = response.headers.get(set_cookie_header)
1528         #         if set_cookie:
1529         #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1530         #             if set_cookie != set_cookie_escaped:
1531         #                 del response.headers[set_cookie_header]
1532         #                 response.headers[set_cookie_header] = set_cookie_escaped
1533         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1534
1535     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1536     https_response = http_response
1537
1538
1539 class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1540     """YoutubeDL redirect handler
1541
1542     The code is based on HTTPRedirectHandler implementation from CPython [1].
1543
1544     This redirect handler solves two issues:
1545      - ensures redirect URL is always unicode under python 2
1546      - introduces support for experimental HTTP response status code
1547        308 Permanent Redirect [2] used by some sites [3]
1548
1549     1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1550     2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1551     3. https://github.com/ytdl-org/youtube-dl/issues/28768
1552     """
1553
1554     http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
1555
1556     def redirect_request(self, req, fp, code, msg, headers, newurl):
1557         """Return a Request or None in response to a redirect.
1558
1559         This is called by the http_error_30x methods when a
1560         redirection response is received.  If a redirection should
1561         take place, return a new Request to allow http_error_30x to
1562         perform the redirect.  Otherwise, raise HTTPError if no-one
1563         else should try to handle this url.  Return None if you can't
1564         but another Handler might.
1565         """
1566         m = req.get_method()
1567         if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1568                  or code in (301, 302, 303) and m == "POST")):
1569             raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1570         # Strictly (according to RFC 2616), 301 or 302 in response to
1571         # a POST MUST NOT cause a redirection without confirmation
1572         # from the user (of urllib.request, in this case).  In practice,
1573         # essentially all clients do redirect in this case, so we do
1574         # the same.
1575
1576         # On python 2 urlh.geturl() may sometimes return redirect URL
1577         # as byte string instead of unicode. This workaround allows
1578         # to force it always return unicode.
1579         if sys.version_info[0] < 3:
1580             newurl = compat_str(newurl)
1581
1582         # Be conciliant with URIs containing a space.  This is mainly
1583         # redundant with the more complete encoding done in http_error_302(),
1584         # but it is kept for compatibility with other callers.
1585         newurl = newurl.replace(' ', '%20')
1586
1587         CONTENT_HEADERS = ("content-length", "content-type")
1588         # NB: don't use dict comprehension for python 2.6 compatibility
1589         newheaders = dict((k, v) for k, v in req.headers.items()
1590                           if k.lower() not in CONTENT_HEADERS)
1591         return compat_urllib_request.Request(
1592             newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1593             unverifiable=True)
1594
1595
1596 def extract_timezone(date_str):
1597     m = re.search(
1598         r'''(?x)
1599             ^.{8,}?                                              # >=8 char non-TZ prefix, if present
1600             (?P<tz>Z|                                            # just the UTC Z, or
1601                 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
1602                    (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1603                    [ ]?                                          # optional space
1604                 (?P<sign>\+|-)                                   # +/-
1605                 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
1606             $)
1607         ''', date_str)
1608     if not m:
1609         timezone = datetime.timedelta()
1610     else:
1611         date_str = date_str[:-len(m.group('tz'))]
1612         if not m.group('sign'):
1613             timezone = datetime.timedelta()
1614         else:
1615             sign = 1 if m.group('sign') == '+' else -1
1616             timezone = datetime.timedelta(
1617                 hours=sign * int(m.group('hours')),
1618                 minutes=sign * int(m.group('minutes')))
1619     return timezone, date_str
1620
1621
1622 def parse_iso8601(date_str, delimiter='T', timezone=None):
1623     """ Return a UNIX timestamp from the given date """
1624
1625     if date_str is None:
1626         return None
1627
1628     date_str = re.sub(r'\.[0-9]+', '', date_str)
1629
1630     if timezone is None:
1631         timezone, date_str = extract_timezone(date_str)
1632
1633     try:
1634         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1635         dt = datetime.datetime.strptime(date_str, date_format) - timezone
1636         return calendar.timegm(dt.timetuple())
1637     except ValueError:
1638         pass
1639
1640
1641 def date_formats(day_first=True):
1642     return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1643
1644
1645 def unified_strdate(date_str, day_first=True):
1646     """Return a string with the date in the format YYYYMMDD"""
1647
1648     if date_str is None:
1649         return None
1650     upload_date = None
1651     # Replace commas
1652     date_str = date_str.replace(',', ' ')
1653     # Remove AM/PM + timezone
1654     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1655     _, date_str = extract_timezone(date_str)
1656
1657     for expression in date_formats(day_first):
1658         try:
1659             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1660         except ValueError:
1661             pass
1662     if upload_date is None:
1663         timetuple = email.utils.parsedate_tz(date_str)
1664         if timetuple:
1665             try:
1666                 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1667             except ValueError:
1668                 pass
1669     if upload_date is not None:
1670         return compat_str(upload_date)
1671
1672
1673 def unified_timestamp(date_str, day_first=True):
1674     if date_str is None:
1675         return None
1676
1677     date_str = re.sub(r'[,|]', '', date_str)
1678
1679     pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1680     timezone, date_str = extract_timezone(date_str)
1681
1682     # Remove AM/PM + timezone
1683     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1684
1685     # Remove unrecognized timezones from ISO 8601 alike timestamps
1686     m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1687     if m:
1688         date_str = date_str[:-len(m.group('tz'))]
1689
1690     # Python only supports microseconds, so remove nanoseconds
1691     m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1692     if m:
1693         date_str = m.group(1)
1694
1695     for expression in date_formats(day_first):
1696         try:
1697             dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1698             return calendar.timegm(dt.timetuple())
1699         except ValueError:
1700             pass
1701     timetuple = email.utils.parsedate_tz(date_str)
1702     if timetuple:
1703         return calendar.timegm(timetuple) + pm_delta * 3600
1704
1705
1706 def determine_ext(url, default_ext='unknown_video'):
1707     if url is None or '.' not in url:
1708         return default_ext
1709     guess = url.partition('?')[0].rpartition('.')[2]
1710     if re.match(r'^[A-Za-z0-9]+$', guess):
1711         return guess
1712     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1713     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1714         return guess.rstrip('/')
1715     else:
1716         return default_ext
1717
1718
1719 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1720     return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1721
1722
1723 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1724     """
1725     Return a datetime object from a string in the format YYYYMMDD or
1726     (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1727
1728     format: string date format used to return datetime object from
1729     precision: round the time portion of a datetime object.
1730                 auto|microsecond|second|minute|hour|day.
1731                 auto: round to the unit provided in date_str (if applicable).
1732     """
1733     auto_precision = False
1734     if precision == 'auto':
1735         auto_precision = True
1736         precision = 'microsecond'
1737     today = datetime_round(datetime.datetime.now(), precision)
1738     if date_str in ('now', 'today'):
1739         return today
1740     if date_str == 'yesterday':
1741         return today - datetime.timedelta(days=1)
1742     match = re.match(
1743         r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1744         date_str)
1745     if match is not None:
1746         start_time = datetime_from_str(match.group('start'), precision, format)
1747         time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1748         unit = match.group('unit')
1749         if unit == 'month' or unit == 'year':
1750             new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1751             unit = 'day'
1752         else:
1753             if unit == 'week':
1754                 unit = 'day'
1755                 time *= 7
1756             delta = datetime.timedelta(**{unit + 's': time})
1757             new_date = start_time + delta
1758         if auto_precision:
1759             return datetime_round(new_date, unit)
1760         return new_date
1761
1762     return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1763
1764
1765 def date_from_str(date_str, format='%Y%m%d'):
1766     """
1767     Return a datetime object from a string in the format YYYYMMDD or
1768     (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1769
1770     format: string date format used to return datetime object from
1771     """
1772     return datetime_from_str(date_str, precision='microsecond', format=format).date()
1773
1774
1775 def datetime_add_months(dt, months):
1776     """Increment/Decrement a datetime object by months."""
1777     month = dt.month + months - 1
1778     year = dt.year + month // 12
1779     month = month % 12 + 1
1780     day = min(dt.day, calendar.monthrange(year, month)[1])
1781     return dt.replace(year, month, day)
1782
1783
1784 def datetime_round(dt, precision='day'):
1785     """
1786     Round a datetime object's time to a specific precision
1787     """
1788     if precision == 'microsecond':
1789         return dt
1790
1791     unit_seconds = {
1792         'day': 86400,
1793         'hour': 3600,
1794         'minute': 60,
1795         'second': 1,
1796     }
1797     roundto = lambda x, n: ((x + n / 2) // n) * n
1798     timestamp = calendar.timegm(dt.timetuple())
1799     return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1800
1801
1802 def hyphenate_date(date_str):
1803     """
1804     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1805     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1806     if match is not None:
1807         return '-'.join(match.groups())
1808     else:
1809         return date_str
1810
1811
1812 class DateRange(object):
1813     """Represents a time interval between two dates"""
1814
1815     def __init__(self, start=None, end=None):
1816         """start and end must be strings in the format accepted by date"""
1817         if start is not None:
1818             self.start = date_from_str(start)
1819         else:
1820             self.start = datetime.datetime.min.date()
1821         if end is not None:
1822             self.end = date_from_str(end)
1823         else:
1824             self.end = datetime.datetime.max.date()
1825         if self.start > self.end:
1826             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1827
1828     @classmethod
1829     def day(cls, day):
1830         """Returns a range that only contains the given day"""
1831         return cls(day, day)
1832
1833     def __contains__(self, date):
1834         """Check if the date is in the range"""
1835         if not isinstance(date, datetime.date):
1836             date = date_from_str(date)
1837         return self.start <= date <= self.end
1838
1839     def __str__(self):
1840         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1841
1842
1843 def platform_name():
1844     """ Returns the platform name as a compat_str """
1845     res = platform.platform()
1846     if isinstance(res, bytes):
1847         res = res.decode(preferredencoding())
1848
1849     assert isinstance(res, compat_str)
1850     return res
1851
1852
1853 def get_windows_version():
1854     ''' Get Windows version. None if it's not running on Windows '''
1855     if compat_os_name == 'nt':
1856         return version_tuple(platform.win32_ver()[1])
1857     else:
1858         return None
1859
1860
1861 def _windows_write_string(s, out):
1862     """ Returns True if the string was written using special methods,
1863     False if it has yet to be written out."""
1864     # Adapted from http://stackoverflow.com/a/3259271/35070
1865
1866     import ctypes.wintypes
1867
1868     WIN_OUTPUT_IDS = {
1869         1: -11,
1870         2: -12,
1871     }
1872
1873     try:
1874         fileno = out.fileno()
1875     except AttributeError:
1876         # If the output stream doesn't have a fileno, it's virtual
1877         return False
1878     except io.UnsupportedOperation:
1879         # Some strange Windows pseudo files?
1880         return False
1881     if fileno not in WIN_OUTPUT_IDS:
1882         return False
1883
1884     GetStdHandle = compat_ctypes_WINFUNCTYPE(
1885         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1886         ('GetStdHandle', ctypes.windll.kernel32))
1887     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1888
1889     WriteConsoleW = compat_ctypes_WINFUNCTYPE(
1890         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1891         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1892         ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
1893     written = ctypes.wintypes.DWORD(0)
1894
1895     GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
1896     FILE_TYPE_CHAR = 0x0002
1897     FILE_TYPE_REMOTE = 0x8000
1898     GetConsoleMode = compat_ctypes_WINFUNCTYPE(
1899         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1900         ctypes.POINTER(ctypes.wintypes.DWORD))(
1901         ('GetConsoleMode', ctypes.windll.kernel32))
1902     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1903
1904     def not_a_console(handle):
1905         if handle == INVALID_HANDLE_VALUE or handle is None:
1906             return True
1907         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1908                 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1909
1910     if not_a_console(h):
1911         return False
1912
1913     def next_nonbmp_pos(s):
1914         try:
1915             return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1916         except StopIteration:
1917             return len(s)
1918
1919     while s:
1920         count = min(next_nonbmp_pos(s), 1024)
1921
1922         ret = WriteConsoleW(
1923             h, s, count if count else 2, ctypes.byref(written), None)
1924         if ret == 0:
1925             raise OSError('Failed to write string')
1926         if not count:  # We just wrote a non-BMP character
1927             assert written.value == 2
1928             s = s[1:]
1929         else:
1930             assert written.value > 0
1931             s = s[written.value:]
1932     return True
1933
1934
1935 def write_string(s, out=None, encoding=None):
1936     if out is None:
1937         out = sys.stderr
1938     assert type(s) == compat_str
1939
1940     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1941         if _windows_write_string(s, out):
1942             return
1943
1944     if ('b' in getattr(out, 'mode', '')
1945             or sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
1946         byt = s.encode(encoding or preferredencoding(), 'ignore')
1947         out.write(byt)
1948     elif hasattr(out, 'buffer'):
1949         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1950         byt = s.encode(enc, 'ignore')
1951         out.buffer.write(byt)
1952     else:
1953         out.write(s)
1954     out.flush()
1955
1956
1957 def bytes_to_intlist(bs):
1958     if not bs:
1959         return []
1960     if isinstance(bs[0], int):  # Python 3
1961         return list(bs)
1962     else:
1963         return [ord(c) for c in bs]
1964
1965
1966 def intlist_to_bytes(xs):
1967     if not xs:
1968         return b''
1969     return compat_struct_pack('%dB' % len(xs), *xs)
1970
1971
1972 # Cross-platform file locking
1973 if sys.platform == 'win32':
1974     import ctypes.wintypes
1975     import msvcrt
1976
1977     class OVERLAPPED(ctypes.Structure):
1978         _fields_ = [
1979             ('Internal', ctypes.wintypes.LPVOID),
1980             ('InternalHigh', ctypes.wintypes.LPVOID),
1981             ('Offset', ctypes.wintypes.DWORD),
1982             ('OffsetHigh', ctypes.wintypes.DWORD),
1983             ('hEvent', ctypes.wintypes.HANDLE),
1984         ]
1985
1986     kernel32 = ctypes.windll.kernel32
1987     LockFileEx = kernel32.LockFileEx
1988     LockFileEx.argtypes = [
1989         ctypes.wintypes.HANDLE,     # hFile
1990         ctypes.wintypes.DWORD,      # dwFlags
1991         ctypes.wintypes.DWORD,      # dwReserved
1992         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1993         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1994         ctypes.POINTER(OVERLAPPED)  # Overlapped
1995     ]
1996     LockFileEx.restype = ctypes.wintypes.BOOL
1997     UnlockFileEx = kernel32.UnlockFileEx
1998     UnlockFileEx.argtypes = [
1999         ctypes.wintypes.HANDLE,     # hFile
2000         ctypes.wintypes.DWORD,      # dwReserved
2001         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
2002         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
2003         ctypes.POINTER(OVERLAPPED)  # Overlapped
2004     ]
2005     UnlockFileEx.restype = ctypes.wintypes.BOOL
2006     whole_low = 0xffffffff
2007     whole_high = 0x7fffffff
2008
2009     def _lock_file(f, exclusive):
2010         overlapped = OVERLAPPED()
2011         overlapped.Offset = 0
2012         overlapped.OffsetHigh = 0
2013         overlapped.hEvent = 0
2014         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2015         handle = msvcrt.get_osfhandle(f.fileno())
2016         if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
2017                           whole_low, whole_high, f._lock_file_overlapped_p):
2018             raise OSError('Locking file failed: %r' % ctypes.FormatError())
2019
2020     def _unlock_file(f):
2021         assert f._lock_file_overlapped_p
2022         handle = msvcrt.get_osfhandle(f.fileno())
2023         if not UnlockFileEx(handle, 0,
2024                             whole_low, whole_high, f._lock_file_overlapped_p):
2025             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2026
2027 else:
2028     # Some platforms, such as Jython, is missing fcntl
2029     try:
2030         import fcntl
2031
2032         def _lock_file(f, exclusive):
2033             fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
2034
2035         def _unlock_file(f):
2036             fcntl.flock(f, fcntl.LOCK_UN)
2037     except ImportError:
2038         UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2039
2040         def _lock_file(f, exclusive):
2041             raise IOError(UNSUPPORTED_MSG)
2042
2043         def _unlock_file(f):
2044             raise IOError(UNSUPPORTED_MSG)
2045
2046
2047 class locked_file(object):
2048     def __init__(self, filename, mode, encoding=None):
2049         assert mode in ['r', 'a', 'w']
2050         self.f = io.open(filename, mode, encoding=encoding)
2051         self.mode = mode
2052
2053     def __enter__(self):
2054         exclusive = self.mode != 'r'
2055         try:
2056             _lock_file(self.f, exclusive)
2057         except IOError:
2058             self.f.close()
2059             raise
2060         return self
2061
2062     def __exit__(self, etype, value, traceback):
2063         try:
2064             _unlock_file(self.f)
2065         finally:
2066             self.f.close()
2067
2068     def __iter__(self):
2069         return iter(self.f)
2070
2071     def write(self, *args):
2072         return self.f.write(*args)
2073
2074     def read(self, *args):
2075         return self.f.read(*args)
2076
2077
2078 def get_filesystem_encoding():
2079     encoding = sys.getfilesystemencoding()
2080     return encoding if encoding is not None else 'utf-8'
2081
2082
2083 def shell_quote(args):
2084     quoted_args = []
2085     encoding = get_filesystem_encoding()
2086     for a in args:
2087         if isinstance(a, bytes):
2088             # We may get a filename encoded with 'encodeFilename'
2089             a = a.decode(encoding)
2090         quoted_args.append(compat_shlex_quote(a))
2091     return ' '.join(quoted_args)
2092
2093
2094 def smuggle_url(url, data):
2095     """ Pass additional data in a URL for internal use. """
2096
2097     url, idata = unsmuggle_url(url, {})
2098     data.update(idata)
2099     sdata = compat_urllib_parse_urlencode(
2100         {'__youtubedl_smuggle': json.dumps(data)})
2101     return url + '#' + sdata
2102
2103
2104 def unsmuggle_url(smug_url, default=None):
2105     if '#__youtubedl_smuggle' not in smug_url:
2106         return smug_url, default
2107     url, _, sdata = smug_url.rpartition('#')
2108     jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2109     data = json.loads(jsond)
2110     return url, data
2111
2112
2113 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2114     """ Formats numbers with decimal sufixes like K, M, etc """
2115     num, factor = float_or_none(num), float(factor)
2116     if num is None:
2117         return None
2118     exponent = 0 if num == 0 else int(math.log(num, factor))
2119     suffix = ['', *'kMGTPEZY'][exponent]
2120     if factor == 1024:
2121         suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2122     converted = num / (factor ** exponent)
2123     return fmt % (converted, suffix)
2124
2125
2126 def format_bytes(bytes):
2127     return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2128
2129
2130 def lookup_unit_table(unit_table, s):
2131     units_re = '|'.join(re.escape(u) for u in unit_table)
2132     m = re.match(
2133         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2134     if not m:
2135         return None
2136     num_str = m.group('num').replace(',', '.')
2137     mult = unit_table[m.group('unit')]
2138     return int(float(num_str) * mult)
2139
2140
2141 def parse_filesize(s):
2142     if s is None:
2143         return None
2144
2145     # The lower-case forms are of course incorrect and unofficial,
2146     # but we support those too
2147     _UNIT_TABLE = {
2148         'B': 1,
2149         'b': 1,
2150         'bytes': 1,
2151         'KiB': 1024,
2152         'KB': 1000,
2153         'kB': 1024,
2154         'Kb': 1000,
2155         'kb': 1000,
2156         'kilobytes': 1000,
2157         'kibibytes': 1024,
2158         'MiB': 1024 ** 2,
2159         'MB': 1000 ** 2,
2160         'mB': 1024 ** 2,
2161         'Mb': 1000 ** 2,
2162         'mb': 1000 ** 2,
2163         'megabytes': 1000 ** 2,
2164         'mebibytes': 1024 ** 2,
2165         'GiB': 1024 ** 3,
2166         'GB': 1000 ** 3,
2167         'gB': 1024 ** 3,
2168         'Gb': 1000 ** 3,
2169         'gb': 1000 ** 3,
2170         'gigabytes': 1000 ** 3,
2171         'gibibytes': 1024 ** 3,
2172         'TiB': 1024 ** 4,
2173         'TB': 1000 ** 4,
2174         'tB': 1024 ** 4,
2175         'Tb': 1000 ** 4,
2176         'tb': 1000 ** 4,
2177         'terabytes': 1000 ** 4,
2178         'tebibytes': 1024 ** 4,
2179         'PiB': 1024 ** 5,
2180         'PB': 1000 ** 5,
2181         'pB': 1024 ** 5,
2182         'Pb': 1000 ** 5,
2183         'pb': 1000 ** 5,
2184         'petabytes': 1000 ** 5,
2185         'pebibytes': 1024 ** 5,
2186         'EiB': 1024 ** 6,
2187         'EB': 1000 ** 6,
2188         'eB': 1024 ** 6,
2189         'Eb': 1000 ** 6,
2190         'eb': 1000 ** 6,
2191         'exabytes': 1000 ** 6,
2192         'exbibytes': 1024 ** 6,
2193         'ZiB': 1024 ** 7,
2194         'ZB': 1000 ** 7,
2195         'zB': 1024 ** 7,
2196         'Zb': 1000 ** 7,
2197         'zb': 1000 ** 7,
2198         'zettabytes': 1000 ** 7,
2199         'zebibytes': 1024 ** 7,
2200         'YiB': 1024 ** 8,
2201         'YB': 1000 ** 8,
2202         'yB': 1024 ** 8,
2203         'Yb': 1000 ** 8,
2204         'yb': 1000 ** 8,
2205         'yottabytes': 1000 ** 8,
2206         'yobibytes': 1024 ** 8,
2207     }
2208
2209     return lookup_unit_table(_UNIT_TABLE, s)
2210
2211
2212 def parse_count(s):
2213     if s is None:
2214         return None
2215
2216     s = re.sub(r'^[^\d]+\s', '', s).strip()
2217
2218     if re.match(r'^[\d,.]+$', s):
2219         return str_to_int(s)
2220
2221     _UNIT_TABLE = {
2222         'k': 1000,
2223         'K': 1000,
2224         'm': 1000 ** 2,
2225         'M': 1000 ** 2,
2226         'kk': 1000 ** 2,
2227         'KK': 1000 ** 2,
2228         'b': 1000 ** 3,
2229         'B': 1000 ** 3,
2230     }
2231
2232     ret = lookup_unit_table(_UNIT_TABLE, s)
2233     if ret is not None:
2234         return ret
2235
2236     mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2237     if mobj:
2238         return str_to_int(mobj.group(1))
2239
2240
2241 def parse_resolution(s):
2242     if s is None:
2243         return {}
2244
2245     mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2246     if mobj:
2247         return {
2248             'width': int(mobj.group('w')),
2249             'height': int(mobj.group('h')),
2250         }
2251
2252     mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2253     if mobj:
2254         return {'height': int(mobj.group(1))}
2255
2256     mobj = re.search(r'\b([48])[kK]\b', s)
2257     if mobj:
2258         return {'height': int(mobj.group(1)) * 540}
2259
2260     return {}
2261
2262
2263 def parse_bitrate(s):
2264     if not isinstance(s, compat_str):
2265         return
2266     mobj = re.search(r'\b(\d+)\s*kbps', s)
2267     if mobj:
2268         return int(mobj.group(1))
2269
2270
2271 def month_by_name(name, lang='en'):
2272     """ Return the number of a month by (locale-independently) English name """
2273
2274     month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2275
2276     try:
2277         return month_names.index(name) + 1
2278     except ValueError:
2279         return None
2280
2281
2282 def month_by_abbreviation(abbrev):
2283     """ Return the number of a month by (locale-independently) English
2284         abbreviations """
2285
2286     try:
2287         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2288     except ValueError:
2289         return None
2290
2291
2292 def fix_xml_ampersands(xml_str):
2293     """Replace all the '&' by '&amp;' in XML"""
2294     return re.sub(
2295         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2296         '&amp;',
2297         xml_str)
2298
2299
2300 def setproctitle(title):
2301     assert isinstance(title, compat_str)
2302
2303     # ctypes in Jython is not complete
2304     # http://bugs.jython.org/issue2148
2305     if sys.platform.startswith('java'):
2306         return
2307
2308     try:
2309         libc = ctypes.cdll.LoadLibrary('libc.so.6')
2310     except OSError:
2311         return
2312     except TypeError:
2313         # LoadLibrary in Windows Python 2.7.13 only expects
2314         # a bytestring, but since unicode_literals turns
2315         # every string into a unicode string, it fails.
2316         return
2317     title_bytes = title.encode('utf-8')
2318     buf = ctypes.create_string_buffer(len(title_bytes))
2319     buf.value = title_bytes
2320     try:
2321         libc.prctl(15, buf, 0, 0, 0)
2322     except AttributeError:
2323         return  # Strange libc, just skip this
2324
2325
2326 def remove_start(s, start):
2327     return s[len(start):] if s is not None and s.startswith(start) else s
2328
2329
2330 def remove_end(s, end):
2331     return s[:-len(end)] if s is not None and s.endswith(end) else s
2332
2333
2334 def remove_quotes(s):
2335     if s is None or len(s) < 2:
2336         return s
2337     for quote in ('"', "'", ):
2338         if s[0] == quote and s[-1] == quote:
2339             return s[1:-1]
2340     return s
2341
2342
2343 def get_domain(url):
2344     domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2345     return domain.group('domain') if domain else None
2346
2347
2348 def url_basename(url):
2349     path = compat_urlparse.urlparse(url).path
2350     return path.strip('/').split('/')[-1]
2351
2352
2353 def base_url(url):
2354     return re.match(r'https?://[^?#&]+/', url).group()
2355
2356
2357 def urljoin(base, path):
2358     if isinstance(path, bytes):
2359         path = path.decode('utf-8')
2360     if not isinstance(path, compat_str) or not path:
2361         return None
2362     if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2363         return path
2364     if isinstance(base, bytes):
2365         base = base.decode('utf-8')
2366     if not isinstance(base, compat_str) or not re.match(
2367             r'^(?:https?:)?//', base):
2368         return None
2369     return compat_urlparse.urljoin(base, path)
2370
2371
2372 class HEADRequest(compat_urllib_request.Request):
2373     def get_method(self):
2374         return 'HEAD'
2375
2376
2377 class PUTRequest(compat_urllib_request.Request):
2378     def get_method(self):
2379         return 'PUT'
2380
2381
2382 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2383     if get_attr:
2384         if v is not None:
2385             v = getattr(v, get_attr, None)
2386     if v == '':
2387         v = None
2388     if v is None:
2389         return default
2390     try:
2391         return int(v) * invscale // scale
2392     except (ValueError, TypeError, OverflowError):
2393         return default
2394
2395
2396 def str_or_none(v, default=None):
2397     return default if v is None else compat_str(v)
2398
2399
2400 def str_to_int(int_str):
2401     """ A more relaxed version of int_or_none """
2402     if isinstance(int_str, compat_integer_types):
2403         return int_str
2404     elif isinstance(int_str, compat_str):
2405         int_str = re.sub(r'[,\.\+]', '', int_str)
2406         return int_or_none(int_str)
2407
2408
2409 def float_or_none(v, scale=1, invscale=1, default=None):
2410     if v is None:
2411         return default
2412     try:
2413         return float(v) * invscale / scale
2414     except (ValueError, TypeError):
2415         return default
2416
2417
2418 def bool_or_none(v, default=None):
2419     return v if isinstance(v, bool) else default
2420
2421
2422 def strip_or_none(v, default=None):
2423     return v.strip() if isinstance(v, compat_str) else default
2424
2425
2426 def url_or_none(url):
2427     if not url or not isinstance(url, compat_str):
2428         return None
2429     url = url.strip()
2430     return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2431
2432
2433 def strftime_or_none(timestamp, date_format, default=None):
2434     datetime_object = None
2435     try:
2436         if isinstance(timestamp, compat_numeric_types):  # unix timestamp
2437             datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2438         elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
2439             datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2440         return datetime_object.strftime(date_format)
2441     except (ValueError, TypeError, AttributeError):
2442         return default
2443
2444
2445 def parse_duration(s):
2446     if not isinstance(s, compat_basestring):
2447         return None
2448     s = s.strip()
2449     if not s:
2450         return None
2451
2452     days, hours, mins, secs, ms = [None] * 5
2453     m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
2454     if m:
2455         days, hours, mins, secs, ms = m.groups()
2456     else:
2457         m = re.match(
2458             r'''(?ix)(?:P?
2459                 (?:
2460                     [0-9]+\s*y(?:ears?)?\s*
2461                 )?
2462                 (?:
2463                     [0-9]+\s*m(?:onths?)?\s*
2464                 )?
2465                 (?:
2466                     [0-9]+\s*w(?:eeks?)?\s*
2467                 )?
2468                 (?:
2469                     (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2470                 )?
2471                 T)?
2472                 (?:
2473                     (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2474                 )?
2475                 (?:
2476                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2477                 )?
2478                 (?:
2479                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2480                 )?Z?$''', s)
2481         if m:
2482             days, hours, mins, secs, ms = m.groups()
2483         else:
2484             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2485             if m:
2486                 hours, mins = m.groups()
2487             else:
2488                 return None
2489
2490     duration = 0
2491     if secs:
2492         duration += float(secs)
2493     if mins:
2494         duration += float(mins) * 60
2495     if hours:
2496         duration += float(hours) * 60 * 60
2497     if days:
2498         duration += float(days) * 24 * 60 * 60
2499     if ms:
2500         duration += float(ms)
2501     return duration
2502
2503
2504 def prepend_extension(filename, ext, expected_real_ext=None):
2505     name, real_ext = os.path.splitext(filename)
2506     return (
2507         '{0}.{1}{2}'.format(name, ext, real_ext)
2508         if not expected_real_ext or real_ext[1:] == expected_real_ext
2509         else '{0}.{1}'.format(filename, ext))
2510
2511
2512 def replace_extension(filename, ext, expected_real_ext=None):
2513     name, real_ext = os.path.splitext(filename)
2514     return '{0}.{1}'.format(
2515         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2516         ext)
2517
2518
2519 def check_executable(exe, args=[]):
2520     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2521     args can be a list of arguments for a short output (like -version) """
2522     try:
2523         Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2524     except OSError:
2525         return False
2526     return exe
2527
2528
2529 def _get_exe_version_output(exe, args):
2530     try:
2531         # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2532         # SIGTTOU if yt-dlp is run in the background.
2533         # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2534         out, _ = Popen(
2535             [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2536             stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2537     except OSError:
2538         return False
2539     if isinstance(out, bytes):  # Python 2.x
2540         out = out.decode('ascii', 'ignore')
2541     return out
2542
2543
2544 def detect_exe_version(output, version_re=None, unrecognized='present'):
2545     assert isinstance(output, compat_str)
2546     if version_re is None:
2547         version_re = r'version\s+([-0-9._a-zA-Z]+)'
2548     m = re.search(version_re, output)
2549     if m:
2550         return m.group(1)
2551     else:
2552         return unrecognized
2553
2554
2555 def get_exe_version(exe, args=['--version'],
2556                     version_re=None, unrecognized='present'):
2557     """ Returns the version of the specified executable,
2558     or False if the executable is not present """
2559     out = _get_exe_version_output(exe, args)
2560     return detect_exe_version(out, version_re, unrecognized) if out else False
2561
2562
2563 class LazyList(collections.abc.Sequence):
2564     ''' Lazy immutable list from an iterable
2565     Note that slices of a LazyList are lists and not LazyList'''
2566
2567     class IndexError(IndexError):
2568         pass
2569
2570     def __init__(self, iterable, *, reverse=False, _cache=None):
2571         self.__iterable = iter(iterable)
2572         self.__cache = [] if _cache is None else _cache
2573         self.__reversed = reverse
2574
2575     def __iter__(self):
2576         if self.__reversed:
2577             # We need to consume the entire iterable to iterate in reverse
2578             yield from self.exhaust()
2579             return
2580         yield from self.__cache
2581         for item in self.__iterable:
2582             self.__cache.append(item)
2583             yield item
2584
2585     def __exhaust(self):
2586         self.__cache.extend(self.__iterable)
2587         # Discard the emptied iterable to make it pickle-able
2588         self.__iterable = []
2589         return self.__cache
2590
2591     def exhaust(self):
2592         ''' Evaluate the entire iterable '''
2593         return self.__exhaust()[::-1 if self.__reversed else 1]
2594
2595     @staticmethod
2596     def __reverse_index(x):
2597         return None if x is None else -(x + 1)
2598
2599     def __getitem__(self, idx):
2600         if isinstance(idx, slice):
2601             if self.__reversed:
2602                 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2603             start, stop, step = idx.start, idx.stop, idx.step or 1
2604         elif isinstance(idx, int):
2605             if self.__reversed:
2606                 idx = self.__reverse_index(idx)
2607             start, stop, step = idx, idx, 0
2608         else:
2609             raise TypeError('indices must be integers or slices')
2610         if ((start or 0) < 0 or (stop or 0) < 0
2611                 or (start is None and step < 0)
2612                 or (stop is None and step > 0)):
2613             # We need to consume the entire iterable to be able to slice from the end
2614             # Obviously, never use this with infinite iterables
2615             self.__exhaust()
2616             try:
2617                 return self.__cache[idx]
2618             except IndexError as e:
2619                 raise self.IndexError(e) from e
2620         n = max(start or 0, stop or 0) - len(self.__cache) + 1
2621         if n > 0:
2622             self.__cache.extend(itertools.islice(self.__iterable, n))
2623         try:
2624             return self.__cache[idx]
2625         except IndexError as e:
2626             raise self.IndexError(e) from e
2627
2628     def __bool__(self):
2629         try:
2630             self[-1] if self.__reversed else self[0]
2631         except self.IndexError:
2632             return False
2633         return True
2634
2635     def __len__(self):
2636         self.__exhaust()
2637         return len(self.__cache)
2638
2639     def __reversed__(self):
2640         return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2641
2642     def __copy__(self):
2643         return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2644
2645     def __repr__(self):
2646         # repr and str should mimic a list. So we exhaust the iterable
2647         return repr(self.exhaust())
2648
2649     def __str__(self):
2650         return repr(self.exhaust())
2651
2652
2653 class PagedList:
2654
2655     class IndexError(IndexError):
2656         pass
2657
2658     def __len__(self):
2659         # This is only useful for tests
2660         return len(self.getslice())
2661
2662     def __init__(self, pagefunc, pagesize, use_cache=True):
2663         self._pagefunc = pagefunc
2664         self._pagesize = pagesize
2665         self._use_cache = use_cache
2666         self._cache = {}
2667
2668     def getpage(self, pagenum):
2669         page_results = self._cache.get(pagenum)
2670         if page_results is None:
2671             page_results = list(self._pagefunc(pagenum))
2672         if self._use_cache:
2673             self._cache[pagenum] = page_results
2674         return page_results
2675
2676     def getslice(self, start=0, end=None):
2677         return list(self._getslice(start, end))
2678
2679     def _getslice(self, start, end):
2680         raise NotImplementedError('This method must be implemented by subclasses')
2681
2682     def __getitem__(self, idx):
2683         # NOTE: cache must be enabled if this is used
2684         if not isinstance(idx, int) or idx < 0:
2685             raise TypeError('indices must be non-negative integers')
2686         entries = self.getslice(idx, idx + 1)
2687         if not entries:
2688             raise self.IndexError()
2689         return entries[0]
2690
2691
2692 class OnDemandPagedList(PagedList):
2693     def _getslice(self, start, end):
2694         for pagenum in itertools.count(start // self._pagesize):
2695             firstid = pagenum * self._pagesize
2696             nextfirstid = pagenum * self._pagesize + self._pagesize
2697             if start >= nextfirstid:
2698                 continue
2699
2700             startv = (
2701                 start % self._pagesize
2702                 if firstid <= start < nextfirstid
2703                 else 0)
2704             endv = (
2705                 ((end - 1) % self._pagesize) + 1
2706                 if (end is not None and firstid <= end <= nextfirstid)
2707                 else None)
2708
2709             page_results = self.getpage(pagenum)
2710             if startv != 0 or endv is not None:
2711                 page_results = page_results[startv:endv]
2712             yield from page_results
2713
2714             # A little optimization - if current page is not "full", ie. does
2715             # not contain page_size videos then we can assume that this page
2716             # is the last one - there are no more ids on further pages -
2717             # i.e. no need to query again.
2718             if len(page_results) + startv < self._pagesize:
2719                 break
2720
2721             # If we got the whole page, but the next page is not interesting,
2722             # break out early as well
2723             if end == nextfirstid:
2724                 break
2725
2726
2727 class InAdvancePagedList(PagedList):
2728     def __init__(self, pagefunc, pagecount, pagesize):
2729         self._pagecount = pagecount
2730         PagedList.__init__(self, pagefunc, pagesize, True)
2731
2732     def _getslice(self, start, end):
2733         start_page = start // self._pagesize
2734         end_page = (
2735             self._pagecount if end is None else (end // self._pagesize + 1))
2736         skip_elems = start - start_page * self._pagesize
2737         only_more = None if end is None else end - start
2738         for pagenum in range(start_page, end_page):
2739             page_results = self.getpage(pagenum)
2740             if skip_elems:
2741                 page_results = page_results[skip_elems:]
2742                 skip_elems = None
2743             if only_more is not None:
2744                 if len(page_results) < only_more:
2745                     only_more -= len(page_results)
2746                 else:
2747                     yield from page_results[:only_more]
2748                     break
2749             yield from page_results
2750
2751
2752 def uppercase_escape(s):
2753     unicode_escape = codecs.getdecoder('unicode_escape')
2754     return re.sub(
2755         r'\\U[0-9a-fA-F]{8}',
2756         lambda m: unicode_escape(m.group(0))[0],
2757         s)
2758
2759
2760 def lowercase_escape(s):
2761     unicode_escape = codecs.getdecoder('unicode_escape')
2762     return re.sub(
2763         r'\\u[0-9a-fA-F]{4}',
2764         lambda m: unicode_escape(m.group(0))[0],
2765         s)
2766
2767
2768 def escape_rfc3986(s):
2769     """Escape non-ASCII characters as suggested by RFC 3986"""
2770     if sys.version_info < (3, 0) and isinstance(s, compat_str):
2771         s = s.encode('utf-8')
2772     return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2773
2774
2775 def escape_url(url):
2776     """Escape URL as suggested by RFC 3986"""
2777     url_parsed = compat_urllib_parse_urlparse(url)
2778     return url_parsed._replace(
2779         netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2780         path=escape_rfc3986(url_parsed.path),
2781         params=escape_rfc3986(url_parsed.params),
2782         query=escape_rfc3986(url_parsed.query),
2783         fragment=escape_rfc3986(url_parsed.fragment)
2784     ).geturl()
2785
2786
2787 def parse_qs(url):
2788     return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2789
2790
2791 def read_batch_urls(batch_fd):
2792     def fixup(url):
2793         if not isinstance(url, compat_str):
2794             url = url.decode('utf-8', 'replace')
2795         BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2796         for bom in BOM_UTF8:
2797             if url.startswith(bom):
2798                 url = url[len(bom):]
2799         url = url.lstrip()
2800         if not url or url.startswith(('#', ';', ']')):
2801             return False
2802         # "#" cannot be stripped out since it is part of the URI
2803         # However, it can be safely stipped out if follwing a whitespace
2804         return re.split(r'\s#', url, 1)[0].rstrip()
2805
2806     with contextlib.closing(batch_fd) as fd:
2807         return [url for url in map(fixup, fd) if url]
2808
2809
2810 def urlencode_postdata(*args, **kargs):
2811     return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2812
2813
2814 def update_url_query(url, query):
2815     if not query:
2816         return url
2817     parsed_url = compat_urlparse.urlparse(url)
2818     qs = compat_parse_qs(parsed_url.query)
2819     qs.update(query)
2820     return compat_urlparse.urlunparse(parsed_url._replace(
2821         query=compat_urllib_parse_urlencode(qs, True)))
2822
2823
2824 def update_Request(req, url=None, data=None, headers={}, query={}):
2825     req_headers = req.headers.copy()
2826     req_headers.update(headers)
2827     req_data = data or req.data
2828     req_url = update_url_query(url or req.get_full_url(), query)
2829     req_get_method = req.get_method()
2830     if req_get_method == 'HEAD':
2831         req_type = HEADRequest
2832     elif req_get_method == 'PUT':
2833         req_type = PUTRequest
2834     else:
2835         req_type = compat_urllib_request.Request
2836     new_req = req_type(
2837         req_url, data=req_data, headers=req_headers,
2838         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2839     if hasattr(req, 'timeout'):
2840         new_req.timeout = req.timeout
2841     return new_req
2842
2843
2844 def _multipart_encode_impl(data, boundary):
2845     content_type = 'multipart/form-data; boundary=%s' % boundary
2846
2847     out = b''
2848     for k, v in data.items():
2849         out += b'--' + boundary.encode('ascii') + b'\r\n'
2850         if isinstance(k, compat_str):
2851             k = k.encode('utf-8')
2852         if isinstance(v, compat_str):
2853             v = v.encode('utf-8')
2854         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2855         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2856         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2857         if boundary.encode('ascii') in content:
2858             raise ValueError('Boundary overlaps with data')
2859         out += content
2860
2861     out += b'--' + boundary.encode('ascii') + b'--\r\n'
2862
2863     return out, content_type
2864
2865
2866 def multipart_encode(data, boundary=None):
2867     '''
2868     Encode a dict to RFC 7578-compliant form-data
2869
2870     data:
2871         A dict where keys and values can be either Unicode or bytes-like
2872         objects.
2873     boundary:
2874         If specified a Unicode object, it's used as the boundary. Otherwise
2875         a random boundary is generated.
2876
2877     Reference: https://tools.ietf.org/html/rfc7578
2878     '''
2879     has_specified_boundary = boundary is not None
2880
2881     while True:
2882         if boundary is None:
2883             boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2884
2885         try:
2886             out, content_type = _multipart_encode_impl(data, boundary)
2887             break
2888         except ValueError:
2889             if has_specified_boundary:
2890                 raise
2891             boundary = None
2892
2893     return out, content_type
2894
2895
2896 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2897     if isinstance(key_or_keys, (list, tuple)):
2898         for key in key_or_keys:
2899             if key not in d or d[key] is None or skip_false_values and not d[key]:
2900                 continue
2901             return d[key]
2902         return default
2903     return d.get(key_or_keys, default)
2904
2905
2906 def try_get(src, getter, expected_type=None):
2907     for get in variadic(getter):
2908         try:
2909             v = get(src)
2910         except (AttributeError, KeyError, TypeError, IndexError):
2911             pass
2912         else:
2913             if expected_type is None or isinstance(v, expected_type):
2914                 return v
2915
2916
2917 def merge_dicts(*dicts):
2918     merged = {}
2919     for a_dict in dicts:
2920         for k, v in a_dict.items():
2921             if v is None:
2922                 continue
2923             if (k not in merged
2924                     or (isinstance(v, compat_str) and v
2925                         and isinstance(merged[k], compat_str)
2926                         and not merged[k])):
2927                 merged[k] = v
2928     return merged
2929
2930
2931 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2932     return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2933
2934
2935 US_RATINGS = {
2936     'G': 0,
2937     'PG': 10,
2938     'PG-13': 13,
2939     'R': 16,
2940     'NC': 18,
2941 }
2942
2943
2944 TV_PARENTAL_GUIDELINES = {
2945     'TV-Y': 0,
2946     'TV-Y7': 7,
2947     'TV-G': 0,
2948     'TV-PG': 0,
2949     'TV-14': 14,
2950     'TV-MA': 17,
2951 }
2952
2953
2954 def parse_age_limit(s):
2955     if type(s) == int:
2956         return s if 0 <= s <= 21 else None
2957     if not isinstance(s, compat_basestring):
2958         return None
2959     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2960     if m:
2961         return int(m.group('age'))
2962     s = s.upper()
2963     if s in US_RATINGS:
2964         return US_RATINGS[s]
2965     m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
2966     if m:
2967         return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
2968     return None
2969
2970
2971 def strip_jsonp(code):
2972     return re.sub(
2973         r'''(?sx)^
2974             (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2975             (?:\s*&&\s*(?P=func_name))?
2976             \s*\(\s*(?P<callback_data>.*)\);?
2977             \s*?(?://[^\n]*)*$''',
2978         r'\g<callback_data>', code)
2979
2980
2981 def js_to_json(code, vars={}):
2982     # vars is a dict of var, val pairs to substitute
2983     COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
2984     SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2985     INTEGER_TABLE = (
2986         (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2987         (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2988     )
2989
2990     def fix_kv(m):
2991         v = m.group(0)
2992         if v in ('true', 'false', 'null'):
2993             return v
2994         elif v in ('undefined', 'void 0'):
2995             return 'null'
2996         elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
2997             return ""
2998
2999         if v[0] in ("'", '"'):
3000             v = re.sub(r'(?s)\\.|"', lambda m: {
3001                 '"': '\\"',
3002                 "\\'": "'",
3003                 '\\\n': '',
3004                 '\\x': '\\u00',
3005             }.get(m.group(0), m.group(0)), v[1:-1])
3006         else:
3007             for regex, base in INTEGER_TABLE:
3008                 im = re.match(regex, v)
3009                 if im:
3010                     i = int(im.group(1), base)
3011                     return '"%d":' % i if v.endswith(':') else '%d' % i
3012
3013             if v in vars:
3014                 return vars[v]
3015
3016         return '"%s"' % v
3017
3018     return re.sub(r'''(?sx)
3019         "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3020         '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3021         {comment}|,(?={skip}[\]}}])|
3022         void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3023         \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3024         [0-9]+(?={skip}:)|
3025         !+
3026         '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3027
3028
3029 def qualities(quality_ids):
3030     """ Get a numeric quality value out of a list of possible values """
3031     def q(qid):
3032         try:
3033             return quality_ids.index(qid)
3034         except ValueError:
3035             return -1
3036     return q
3037
3038
3039 DEFAULT_OUTTMPL = {
3040     'default': '%(title)s [%(id)s].%(ext)s',
3041     'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3042 }
3043 OUTTMPL_TYPES = {
3044     'chapter': None,
3045     'subtitle': None,
3046     'thumbnail': None,
3047     'description': 'description',
3048     'annotation': 'annotations.xml',
3049     'infojson': 'info.json',
3050     'link': None,
3051     'pl_thumbnail': None,
3052     'pl_description': 'description',
3053     'pl_infojson': 'info.json',
3054 }
3055
3056 # As of [1] format syntax is:
3057 #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3058 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3059 STR_FORMAT_RE_TMPL = r'''(?x)
3060     (?<!%)(?P<prefix>(?:%%)*)
3061     %
3062     (?P<has_key>\((?P<key>{0})\))?
3063     (?P<format>
3064         (?P<conversion>[#0\-+ ]+)?
3065         (?P<min_width>\d+)?
3066         (?P<precision>\.\d+)?
3067         (?P<len_mod>[hlL])?  # unused in python
3068         {1}  # conversion type
3069     )
3070 '''
3071
3072
3073 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3074
3075
3076 def limit_length(s, length):
3077     """ Add ellipses to overly long strings """
3078     if s is None:
3079         return None
3080     ELLIPSES = '...'
3081     if len(s) > length:
3082         return s[:length - len(ELLIPSES)] + ELLIPSES
3083     return s
3084
3085
3086 def version_tuple(v):
3087     return tuple(int(e) for e in re.split(r'[-.]', v))
3088
3089
3090 def is_outdated_version(version, limit, assume_new=True):
3091     if not version:
3092         return not assume_new
3093     try:
3094         return version_tuple(version) < version_tuple(limit)
3095     except ValueError:
3096         return not assume_new
3097
3098
3099 def ytdl_is_updateable():
3100     """ Returns if yt-dlp can be updated with -U """
3101
3102     from .update import is_non_updateable
3103
3104     return not is_non_updateable()
3105
3106
3107 def args_to_str(args):
3108     # Get a short string representation for a subprocess command
3109     return ' '.join(compat_shlex_quote(a) for a in args)
3110
3111
3112 def error_to_compat_str(err):
3113     err_str = str(err)
3114     # On python 2 error byte string must be decoded with proper
3115     # encoding rather than ascii
3116     if sys.version_info[0] < 3:
3117         err_str = err_str.decode(preferredencoding())
3118     return err_str
3119
3120
3121 def mimetype2ext(mt):
3122     if mt is None:
3123         return None
3124
3125     mt, _, params = mt.partition(';')
3126     mt = mt.strip()
3127
3128     FULL_MAP = {
3129         'audio/mp4': 'm4a',
3130         # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3131         # it's the most popular one
3132         'audio/mpeg': 'mp3',
3133         'audio/x-wav': 'wav',
3134         'audio/wav': 'wav',
3135         'audio/wave': 'wav',
3136     }
3137
3138     ext = FULL_MAP.get(mt)
3139     if ext is not None:
3140         return ext
3141
3142     SUBTYPE_MAP = {
3143         '3gpp': '3gp',
3144         'smptett+xml': 'tt',
3145         'ttaf+xml': 'dfxp',
3146         'ttml+xml': 'ttml',
3147         'x-flv': 'flv',
3148         'x-mp4-fragmented': 'mp4',
3149         'x-ms-sami': 'sami',
3150         'x-ms-wmv': 'wmv',
3151         'mpegurl': 'm3u8',
3152         'x-mpegurl': 'm3u8',
3153         'vnd.apple.mpegurl': 'm3u8',
3154         'dash+xml': 'mpd',
3155         'f4m+xml': 'f4m',
3156         'hds+xml': 'f4m',
3157         'vnd.ms-sstr+xml': 'ism',
3158         'quicktime': 'mov',
3159         'mp2t': 'ts',
3160         'x-wav': 'wav',
3161         'filmstrip+json': 'fs',
3162         'svg+xml': 'svg',
3163     }
3164
3165     _, _, subtype = mt.rpartition('/')
3166     ext = SUBTYPE_MAP.get(subtype.lower())
3167     if ext is not None:
3168         return ext
3169
3170     SUFFIX_MAP = {
3171         'json': 'json',
3172         'xml': 'xml',
3173         'zip': 'zip',
3174         'gzip': 'gz',
3175     }
3176
3177     _, _, suffix = subtype.partition('+')
3178     ext = SUFFIX_MAP.get(suffix)
3179     if ext is not None:
3180         return ext
3181
3182     return subtype.replace('+', '.')
3183
3184
3185 def ext2mimetype(ext_or_url):
3186     if not ext_or_url:
3187         return None
3188     if '.' not in ext_or_url:
3189         ext_or_url = f'file.{ext_or_url}'
3190     return mimetypes.guess_type(ext_or_url)[0]
3191
3192
3193 def parse_codecs(codecs_str):
3194     # http://tools.ietf.org/html/rfc6381
3195     if not codecs_str:
3196         return {}
3197     split_codecs = list(filter(None, map(
3198         str.strip, codecs_str.strip().strip(',').split(','))))
3199     vcodec, acodec, hdr = None, None, None
3200     for full_codec in split_codecs:
3201         parts = full_codec.split('.')
3202         codec = parts[0].replace('0', '')
3203         if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3204                      'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3205             if not vcodec:
3206                 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3207                 if codec in ('dvh1', 'dvhe'):
3208                     hdr = 'DV'
3209                 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3210                     hdr = 'HDR10'
3211                 elif full_codec.replace('0', '').startswith('vp9.2'):
3212                     hdr = 'HDR10'
3213         elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3214             if not acodec:
3215                 acodec = full_codec
3216         else:
3217             write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3218     if vcodec or acodec:
3219         return {
3220             'vcodec': vcodec or 'none',
3221             'acodec': acodec or 'none',
3222             'dynamic_range': hdr,
3223         }
3224     elif len(split_codecs) == 2:
3225         return {
3226             'vcodec': split_codecs[0],
3227             'acodec': split_codecs[1],
3228         }
3229     return {}
3230
3231
3232 def urlhandle_detect_ext(url_handle):
3233     getheader = url_handle.headers.get
3234
3235     cd = getheader('Content-Disposition')
3236     if cd:
3237         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3238         if m:
3239             e = determine_ext(m.group('filename'), default_ext=None)
3240             if e:
3241                 return e
3242
3243     return mimetype2ext(getheader('Content-Type'))
3244
3245
3246 def encode_data_uri(data, mime_type):
3247     return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3248
3249
3250 def age_restricted(content_limit, age_limit):
3251     """ Returns True iff the content should be blocked """
3252
3253     if age_limit is None:  # No limit set
3254         return False
3255     if content_limit is None:
3256         return False  # Content available for everyone
3257     return age_limit < content_limit
3258
3259
3260 def is_html(first_bytes):
3261     """ Detect whether a file contains HTML by examining its first bytes. """
3262
3263     BOMS = [
3264         (b'\xef\xbb\xbf', 'utf-8'),
3265         (b'\x00\x00\xfe\xff', 'utf-32-be'),
3266         (b'\xff\xfe\x00\x00', 'utf-32-le'),
3267         (b'\xff\xfe', 'utf-16-le'),
3268         (b'\xfe\xff', 'utf-16-be'),
3269     ]
3270     for bom, enc in BOMS:
3271         if first_bytes.startswith(bom):
3272             s = first_bytes[len(bom):].decode(enc, 'replace')
3273             break
3274     else:
3275         s = first_bytes.decode('utf-8', 'replace')
3276
3277     return re.match(r'^\s*<', s)
3278
3279
3280 def determine_protocol(info_dict):
3281     protocol = info_dict.get('protocol')
3282     if protocol is not None:
3283         return protocol
3284
3285     url = sanitize_url(info_dict['url'])
3286     if url.startswith('rtmp'):
3287         return 'rtmp'
3288     elif url.startswith('mms'):
3289         return 'mms'
3290     elif url.startswith('rtsp'):
3291         return 'rtsp'
3292
3293     ext = determine_ext(url)
3294     if ext == 'm3u8':
3295         return 'm3u8'
3296     elif ext == 'f4m':
3297         return 'f4m'
3298
3299     return compat_urllib_parse_urlparse(url).scheme
3300
3301
3302 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3303     """ Render a list of rows, each as a list of values.
3304     Text after a \t will be right aligned """
3305     def width(string):
3306         return len(remove_terminal_sequences(string).replace('\t', ''))
3307
3308     def get_max_lens(table):
3309         return [max(width(str(v)) for v in col) for col in zip(*table)]
3310
3311     def filter_using_list(row, filterArray):
3312         return [col for (take, col) in zip(filterArray, row) if take]
3313
3314     if hide_empty:
3315         max_lens = get_max_lens(data)
3316         header_row = filter_using_list(header_row, max_lens)
3317         data = [filter_using_list(row, max_lens) for row in data]
3318
3319     table = [header_row] + data
3320     max_lens = get_max_lens(table)
3321     extra_gap += 1
3322     if delim:
3323         table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3324         table[1][-1] = table[1][-1][:-extra_gap]  # Remove extra_gap from end of delimiter
3325     for row in table:
3326         for pos, text in enumerate(map(str, row)):
3327             if '\t' in text:
3328                 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3329             else:
3330                 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3331     ret = '\n'.join(''.join(row).rstrip() for row in table)
3332     return ret
3333
3334
3335 def _match_one(filter_part, dct, incomplete):
3336     # TODO: Generalize code with YoutubeDL._build_format_filter
3337     STRING_OPERATORS = {
3338         '*=': operator.contains,
3339         '^=': lambda attr, value: attr.startswith(value),
3340         '$=': lambda attr, value: attr.endswith(value),
3341         '~=': lambda attr, value: re.search(value, attr),
3342     }
3343     COMPARISON_OPERATORS = {
3344         **STRING_OPERATORS,
3345         '<=': operator.le,  # "<=" must be defined above "<"
3346         '<': operator.lt,
3347         '>=': operator.ge,
3348         '>': operator.gt,
3349         '=': operator.eq,
3350     }
3351
3352     operator_rex = re.compile(r'''(?x)\s*
3353         (?P<key>[a-z_]+)
3354         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3355         (?:
3356             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3357             (?P<strval>.+?)
3358         )
3359         \s*$
3360         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3361     m = operator_rex.search(filter_part)
3362     if m:
3363         m = m.groupdict()
3364         unnegated_op = COMPARISON_OPERATORS[m['op']]
3365         if m['negation']:
3366             op = lambda attr, value: not unnegated_op(attr, value)
3367         else:
3368             op = unnegated_op
3369         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3370         if m['quote']:
3371             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3372         actual_value = dct.get(m['key'])
3373         numeric_comparison = None
3374         if isinstance(actual_value, compat_numeric_types):
3375             # If the original field is a string and matching comparisonvalue is
3376             # a number we should respect the origin of the original field
3377             # and process comparison value as a string (see
3378             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3379             try:
3380                 numeric_comparison = int(comparison_value)
3381             except ValueError:
3382                 numeric_comparison = parse_filesize(comparison_value)
3383                 if numeric_comparison is None:
3384                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3385                 if numeric_comparison is None:
3386                     numeric_comparison = parse_duration(comparison_value)
3387         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3388             raise ValueError('Operator %s only supports string values!' % m['op'])
3389         if actual_value is None:
3390             return incomplete or m['none_inclusive']
3391         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3392
3393     UNARY_OPERATORS = {
3394         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3395         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3396     }
3397     operator_rex = re.compile(r'''(?x)\s*
3398         (?P<op>%s)\s*(?P<key>[a-z_]+)
3399         \s*$
3400         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3401     m = operator_rex.search(filter_part)
3402     if m:
3403         op = UNARY_OPERATORS[m.group('op')]
3404         actual_value = dct.get(m.group('key'))
3405         if incomplete and actual_value is None:
3406             return True
3407         return op(actual_value)
3408
3409     raise ValueError('Invalid filter part %r' % filter_part)
3410
3411
3412 def match_str(filter_str, dct, incomplete=False):
3413     """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3414         When incomplete, all conditions passes on missing fields
3415     """
3416     return all(
3417         _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3418         for filter_part in re.split(r'(?<!\\)&', filter_str))
3419
3420
3421 def match_filter_func(filter_str):
3422     def _match_func(info_dict, *args, **kwargs):
3423         if match_str(filter_str, info_dict, *args, **kwargs):
3424             return None
3425         else:
3426             video_title = info_dict.get('title', info_dict.get('id', 'video'))
3427             return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3428     return _match_func
3429
3430
3431 def parse_dfxp_time_expr(time_expr):
3432     if not time_expr:
3433         return
3434
3435     mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3436     if mobj:
3437         return float(mobj.group('time_offset'))
3438
3439     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3440     if mobj:
3441         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3442
3443
3444 def srt_subtitles_timecode(seconds):
3445     return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3446
3447
3448 def ass_subtitles_timecode(seconds):
3449     time = timetuple_from_msec(seconds * 1000)
3450     return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3451
3452
3453 def dfxp2srt(dfxp_data):
3454     '''
3455     @param dfxp_data A bytes-like object containing DFXP data
3456     @returns A unicode object containing converted SRT data
3457     '''
3458     LEGACY_NAMESPACES = (
3459         (b'http://www.w3.org/ns/ttml', [
3460             b'http://www.w3.org/2004/11/ttaf1',
3461             b'http://www.w3.org/2006/04/ttaf1',
3462             b'http://www.w3.org/2006/10/ttaf1',
3463         ]),
3464         (b'http://www.w3.org/ns/ttml#styling', [
3465             b'http://www.w3.org/ns/ttml#style',
3466         ]),
3467     )
3468
3469     SUPPORTED_STYLING = [
3470         'color',
3471         'fontFamily',
3472         'fontSize',
3473         'fontStyle',
3474         'fontWeight',
3475         'textDecoration'
3476     ]
3477
3478     _x = functools.partial(xpath_with_ns, ns_map={
3479         'xml': 'http://www.w3.org/XML/1998/namespace',
3480         'ttml': 'http://www.w3.org/ns/ttml',
3481         'tts': 'http://www.w3.org/ns/ttml#styling',
3482     })
3483
3484     styles = {}
3485     default_style = {}
3486
3487     class TTMLPElementParser(object):
3488         _out = ''
3489         _unclosed_elements = []
3490         _applied_styles = []
3491
3492         def start(self, tag, attrib):
3493             if tag in (_x('ttml:br'), 'br'):
3494                 self._out += '\n'
3495             else:
3496                 unclosed_elements = []
3497                 style = {}
3498                 element_style_id = attrib.get('style')
3499                 if default_style:
3500                     style.update(default_style)
3501                 if element_style_id:
3502                     style.update(styles.get(element_style_id, {}))
3503                 for prop in SUPPORTED_STYLING:
3504                     prop_val = attrib.get(_x('tts:' + prop))
3505                     if prop_val:
3506                         style[prop] = prop_val
3507                 if style:
3508                     font = ''
3509                     for k, v in sorted(style.items()):
3510                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3511                             continue
3512                         if k == 'color':
3513                             font += ' color="%s"' % v
3514                         elif k == 'fontSize':
3515                             font += ' size="%s"' % v
3516                         elif k == 'fontFamily':
3517                             font += ' face="%s"' % v
3518                         elif k == 'fontWeight' and v == 'bold':
3519                             self._out += '<b>'
3520                             unclosed_elements.append('b')
3521                         elif k == 'fontStyle' and v == 'italic':
3522                             self._out += '<i>'
3523                             unclosed_elements.append('i')
3524                         elif k == 'textDecoration' and v == 'underline':
3525                             self._out += '<u>'
3526                             unclosed_elements.append('u')
3527                     if font:
3528                         self._out += '<font' + font + '>'
3529                         unclosed_elements.append('font')
3530                     applied_style = {}
3531                     if self._applied_styles:
3532                         applied_style.update(self._applied_styles[-1])
3533                     applied_style.update(style)
3534                     self._applied_styles.append(applied_style)
3535                 self._unclosed_elements.append(unclosed_elements)
3536
3537         def end(self, tag):
3538             if tag not in (_x('ttml:br'), 'br'):
3539                 unclosed_elements = self._unclosed_elements.pop()
3540                 for element in reversed(unclosed_elements):
3541                     self._out += '</%s>' % element
3542                 if unclosed_elements and self._applied_styles:
3543                     self._applied_styles.pop()
3544
3545         def data(self, data):
3546             self._out += data
3547
3548         def close(self):
3549             return self._out.strip()
3550
3551     def parse_node(node):
3552         target = TTMLPElementParser()
3553         parser = xml.etree.ElementTree.XMLParser(target=target)
3554         parser.feed(xml.etree.ElementTree.tostring(node))
3555         return parser.close()
3556
3557     for k, v in LEGACY_NAMESPACES:
3558         for ns in v:
3559             dfxp_data = dfxp_data.replace(ns, k)
3560
3561     dfxp = compat_etree_fromstring(dfxp_data)
3562     out = []
3563     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3564
3565     if not paras:
3566         raise ValueError('Invalid dfxp/TTML subtitle')
3567
3568     repeat = False
3569     while True:
3570         for style in dfxp.findall(_x('.//ttml:style')):
3571             style_id = style.get('id') or style.get(_x('xml:id'))
3572             if not style_id:
3573                 continue
3574             parent_style_id = style.get('style')
3575             if parent_style_id:
3576                 if parent_style_id not in styles:
3577                     repeat = True
3578                     continue
3579                 styles[style_id] = styles[parent_style_id].copy()
3580             for prop in SUPPORTED_STYLING:
3581                 prop_val = style.get(_x('tts:' + prop))
3582                 if prop_val:
3583                     styles.setdefault(style_id, {})[prop] = prop_val
3584         if repeat:
3585             repeat = False
3586         else:
3587             break
3588
3589     for p in ('body', 'div'):
3590         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3591         if ele is None:
3592             continue
3593         style = styles.get(ele.get('style'))
3594         if not style:
3595             continue
3596         default_style.update(style)
3597
3598     for para, index in zip(paras, itertools.count(1)):
3599         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3600         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3601         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3602         if begin_time is None:
3603             continue
3604         if not end_time:
3605             if not dur:
3606                 continue
3607             end_time = begin_time + dur
3608         out.append('%d\n%s --> %s\n%s\n\n' % (
3609             index,
3610             srt_subtitles_timecode(begin_time),
3611             srt_subtitles_timecode(end_time),
3612             parse_node(para)))
3613
3614     return ''.join(out)
3615
3616
3617 def cli_option(params, command_option, param):
3618     param = params.get(param)
3619     if param:
3620         param = compat_str(param)
3621     return [command_option, param] if param is not None else []
3622
3623
3624 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3625     param = params.get(param)
3626     if param is None:
3627         return []
3628     assert isinstance(param, bool)
3629     if separator:
3630         return [command_option + separator + (true_value if param else false_value)]
3631     return [command_option, true_value if param else false_value]
3632
3633
3634 def cli_valueless_option(params, command_option, param, expected_value=True):
3635     param = params.get(param)
3636     return [command_option] if param == expected_value else []
3637
3638
3639 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3640     if isinstance(argdict, (list, tuple)):  # for backward compatibility
3641         if use_compat:
3642             return argdict
3643         else:
3644             argdict = None
3645     if argdict is None:
3646         return default
3647     assert isinstance(argdict, dict)
3648
3649     assert isinstance(keys, (list, tuple))
3650     for key_list in keys:
3651         arg_list = list(filter(
3652             lambda x: x is not None,
3653             [argdict.get(key.lower()) for key in variadic(key_list)]))
3654         if arg_list:
3655             return [arg for args in arg_list for arg in args]
3656     return default
3657
3658
3659 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3660     main_key, exe = main_key.lower(), exe.lower()
3661     root_key = exe if main_key == exe else f'{main_key}+{exe}'
3662     keys = [f'{root_key}{k}' for k in (keys or [''])]
3663     if root_key in keys:
3664         if main_key != exe:
3665             keys.append((main_key, exe))
3666         keys.append('default')
3667     else:
3668         use_compat = False
3669     return cli_configuration_args(argdict, keys, default, use_compat)
3670
3671
3672 class ISO639Utils(object):
3673     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3674     _lang_map = {
3675         'aa': 'aar',
3676         'ab': 'abk',
3677         'ae': 'ave',
3678         'af': 'afr',
3679         'ak': 'aka',
3680         'am': 'amh',
3681         'an': 'arg',
3682         'ar': 'ara',
3683         'as': 'asm',
3684         'av': 'ava',
3685         'ay': 'aym',
3686         'az': 'aze',
3687         'ba': 'bak',
3688         'be': 'bel',
3689         'bg': 'bul',
3690         'bh': 'bih',
3691         'bi': 'bis',
3692         'bm': 'bam',
3693         'bn': 'ben',
3694         'bo': 'bod',
3695         'br': 'bre',
3696         'bs': 'bos',
3697         'ca': 'cat',
3698         'ce': 'che',
3699         'ch': 'cha',
3700         'co': 'cos',
3701         'cr': 'cre',
3702         'cs': 'ces',
3703         'cu': 'chu',
3704         'cv': 'chv',
3705         'cy': 'cym',
3706         'da': 'dan',
3707         'de': 'deu',
3708         'dv': 'div',
3709         'dz': 'dzo',
3710         'ee': 'ewe',
3711         'el': 'ell',
3712         'en': 'eng',
3713         'eo': 'epo',
3714         'es': 'spa',
3715         'et': 'est',
3716         'eu': 'eus',
3717         'fa': 'fas',
3718         'ff': 'ful',
3719         'fi': 'fin',
3720         'fj': 'fij',
3721         'fo': 'fao',
3722         'fr': 'fra',
3723         'fy': 'fry',
3724         'ga': 'gle',
3725         'gd': 'gla',
3726         'gl': 'glg',
3727         'gn': 'grn',
3728         'gu': 'guj',
3729         'gv': 'glv',
3730         'ha': 'hau',
3731         'he': 'heb',
3732         'iw': 'heb',  # Replaced by he in 1989 revision
3733         'hi': 'hin',
3734         'ho': 'hmo',
3735         'hr': 'hrv',
3736         'ht': 'hat',
3737         'hu': 'hun',
3738         'hy': 'hye',
3739         'hz': 'her',
3740         'ia': 'ina',
3741         'id': 'ind',
3742         'in': 'ind',  # Replaced by id in 1989 revision
3743         'ie': 'ile',
3744         'ig': 'ibo',
3745         'ii': 'iii',
3746         'ik': 'ipk',
3747         'io': 'ido',
3748         'is': 'isl',
3749         'it': 'ita',
3750         'iu': 'iku',
3751         'ja': 'jpn',
3752         'jv': 'jav',
3753         'ka': 'kat',
3754         'kg': 'kon',
3755         'ki': 'kik',
3756         'kj': 'kua',
3757         'kk': 'kaz',
3758         'kl': 'kal',
3759         'km': 'khm',
3760         'kn': 'kan',
3761         'ko': 'kor',
3762         'kr': 'kau',
3763         'ks': 'kas',
3764         'ku': 'kur',
3765         'kv': 'kom',
3766         'kw': 'cor',
3767         'ky': 'kir',
3768         'la': 'lat',
3769         'lb': 'ltz',
3770         'lg': 'lug',
3771         'li': 'lim',
3772         'ln': 'lin',
3773         'lo': 'lao',
3774         'lt': 'lit',
3775         'lu': 'lub',
3776         'lv': 'lav',
3777         'mg': 'mlg',
3778         'mh': 'mah',
3779         'mi': 'mri',
3780         'mk': 'mkd',
3781         'ml': 'mal',
3782         'mn': 'mon',
3783         'mr': 'mar',
3784         'ms': 'msa',
3785         'mt': 'mlt',
3786         'my': 'mya',
3787         'na': 'nau',
3788         'nb': 'nob',
3789         'nd': 'nde',
3790         'ne': 'nep',
3791         'ng': 'ndo',
3792         'nl': 'nld',
3793         'nn': 'nno',
3794         'no': 'nor',
3795         'nr': 'nbl',
3796         'nv': 'nav',
3797         'ny': 'nya',
3798         'oc': 'oci',
3799         'oj': 'oji',
3800         'om': 'orm',
3801         'or': 'ori',
3802         'os': 'oss',
3803         'pa': 'pan',
3804         'pi': 'pli',
3805         'pl': 'pol',
3806         'ps': 'pus',
3807         'pt': 'por',
3808         'qu': 'que',
3809         'rm': 'roh',
3810         'rn': 'run',
3811         'ro': 'ron',
3812         'ru': 'rus',
3813         'rw': 'kin',
3814         'sa': 'san',
3815         'sc': 'srd',
3816         'sd': 'snd',
3817         'se': 'sme',
3818         'sg': 'sag',
3819         'si': 'sin',
3820         'sk': 'slk',
3821         'sl': 'slv',
3822         'sm': 'smo',
3823         'sn': 'sna',
3824         'so': 'som',
3825         'sq': 'sqi',
3826         'sr': 'srp',
3827         'ss': 'ssw',
3828         'st': 'sot',
3829         'su': 'sun',
3830         'sv': 'swe',
3831         'sw': 'swa',
3832         'ta': 'tam',
3833         'te': 'tel',
3834         'tg': 'tgk',
3835         'th': 'tha',
3836         'ti': 'tir',
3837         'tk': 'tuk',
3838         'tl': 'tgl',
3839         'tn': 'tsn',
3840         'to': 'ton',
3841         'tr': 'tur',
3842         'ts': 'tso',
3843         'tt': 'tat',
3844         'tw': 'twi',
3845         'ty': 'tah',
3846         'ug': 'uig',
3847         'uk': 'ukr',
3848         'ur': 'urd',
3849         'uz': 'uzb',
3850         've': 'ven',
3851         'vi': 'vie',
3852         'vo': 'vol',
3853         'wa': 'wln',
3854         'wo': 'wol',
3855         'xh': 'xho',
3856         'yi': 'yid',
3857         'ji': 'yid',  # Replaced by yi in 1989 revision
3858         'yo': 'yor',
3859         'za': 'zha',
3860         'zh': 'zho',
3861         'zu': 'zul',
3862     }
3863
3864     @classmethod
3865     def short2long(cls, code):
3866         """Convert language code from ISO 639-1 to ISO 639-2/T"""
3867         return cls._lang_map.get(code[:2])
3868
3869     @classmethod
3870     def long2short(cls, code):
3871         """Convert language code from ISO 639-2/T to ISO 639-1"""
3872         for short_name, long_name in cls._lang_map.items():
3873             if long_name == code:
3874                 return short_name
3875
3876
3877 class ISO3166Utils(object):
3878     # From http://data.okfn.org/data/core/country-list
3879     _country_map = {
3880         'AF': 'Afghanistan',
3881         'AX': 'Åland Islands',
3882         'AL': 'Albania',
3883         'DZ': 'Algeria',
3884         'AS': 'American Samoa',
3885         'AD': 'Andorra',
3886         'AO': 'Angola',
3887         'AI': 'Anguilla',
3888         'AQ': 'Antarctica',
3889         'AG': 'Antigua and Barbuda',
3890         'AR': 'Argentina',
3891         'AM': 'Armenia',
3892         'AW': 'Aruba',
3893         'AU': 'Australia',
3894         'AT': 'Austria',
3895         'AZ': 'Azerbaijan',
3896         'BS': 'Bahamas',
3897         'BH': 'Bahrain',
3898         'BD': 'Bangladesh',
3899         'BB': 'Barbados',
3900         'BY': 'Belarus',
3901         'BE': 'Belgium',
3902         'BZ': 'Belize',
3903         'BJ': 'Benin',
3904         'BM': 'Bermuda',
3905         'BT': 'Bhutan',
3906         'BO': 'Bolivia, Plurinational State of',
3907         'BQ': 'Bonaire, Sint Eustatius and Saba',
3908         'BA': 'Bosnia and Herzegovina',
3909         'BW': 'Botswana',
3910         'BV': 'Bouvet Island',
3911         'BR': 'Brazil',
3912         'IO': 'British Indian Ocean Territory',
3913         'BN': 'Brunei Darussalam',
3914         'BG': 'Bulgaria',
3915         'BF': 'Burkina Faso',
3916         'BI': 'Burundi',
3917         'KH': 'Cambodia',
3918         'CM': 'Cameroon',
3919         'CA': 'Canada',
3920         'CV': 'Cape Verde',
3921         'KY': 'Cayman Islands',
3922         'CF': 'Central African Republic',
3923         'TD': 'Chad',
3924         'CL': 'Chile',
3925         'CN': 'China',
3926         'CX': 'Christmas Island',
3927         'CC': 'Cocos (Keeling) Islands',
3928         'CO': 'Colombia',
3929         'KM': 'Comoros',
3930         'CG': 'Congo',
3931         'CD': 'Congo, the Democratic Republic of the',
3932         'CK': 'Cook Islands',
3933         'CR': 'Costa Rica',
3934         'CI': 'Côte d\'Ivoire',
3935         'HR': 'Croatia',
3936         'CU': 'Cuba',
3937         'CW': 'Curaçao',
3938         'CY': 'Cyprus',
3939         'CZ': 'Czech Republic',
3940         'DK': 'Denmark',
3941         'DJ': 'Djibouti',
3942         'DM': 'Dominica',
3943         'DO': 'Dominican Republic',
3944         'EC': 'Ecuador',
3945         'EG': 'Egypt',
3946         'SV': 'El Salvador',
3947         'GQ': 'Equatorial Guinea',
3948         'ER': 'Eritrea',
3949         'EE': 'Estonia',
3950         'ET': 'Ethiopia',
3951         'FK': 'Falkland Islands (Malvinas)',
3952         'FO': 'Faroe Islands',
3953         'FJ': 'Fiji',
3954         'FI': 'Finland',
3955         'FR': 'France',
3956         'GF': 'French Guiana',
3957         'PF': 'French Polynesia',
3958         'TF': 'French Southern Territories',
3959         'GA': 'Gabon',
3960         'GM': 'Gambia',
3961         'GE': 'Georgia',
3962         'DE': 'Germany',
3963         'GH': 'Ghana',
3964         'GI': 'Gibraltar',
3965         'GR': 'Greece',
3966         'GL': 'Greenland',
3967         'GD': 'Grenada',
3968         'GP': 'Guadeloupe',
3969         'GU': 'Guam',
3970         'GT': 'Guatemala',
3971         'GG': 'Guernsey',
3972         'GN': 'Guinea',
3973         'GW': 'Guinea-Bissau',
3974         'GY': 'Guyana',
3975         'HT': 'Haiti',
3976         'HM': 'Heard Island and McDonald Islands',
3977         'VA': 'Holy See (Vatican City State)',
3978         'HN': 'Honduras',
3979         'HK': 'Hong Kong',
3980         'HU': 'Hungary',
3981         'IS': 'Iceland',
3982         'IN': 'India',
3983         'ID': 'Indonesia',
3984         'IR': 'Iran, Islamic Republic of',
3985         'IQ': 'Iraq',
3986         'IE': 'Ireland',
3987         'IM': 'Isle of Man',
3988         'IL': 'Israel',
3989         'IT': 'Italy',
3990         'JM': 'Jamaica',
3991         'JP': 'Japan',
3992         'JE': 'Jersey',
3993         'JO': 'Jordan',
3994         'KZ': 'Kazakhstan',
3995         'KE': 'Kenya',
3996         'KI': 'Kiribati',
3997         'KP': 'Korea, Democratic People\'s Republic of',
3998         'KR': 'Korea, Republic of',
3999         'KW': 'Kuwait',
4000         'KG': 'Kyrgyzstan',
4001         'LA': 'Lao People\'s Democratic Republic',
4002         'LV': 'Latvia',
4003         'LB': 'Lebanon',
4004         'LS': 'Lesotho',
4005         'LR': 'Liberia',
4006         'LY': 'Libya',
4007         'LI': 'Liechtenstein',
4008         'LT': 'Lithuania',
4009         'LU': 'Luxembourg',
4010         'MO': 'Macao',
4011         'MK': 'Macedonia, the Former Yugoslav Republic of',
4012         'MG': 'Madagascar',
4013         'MW': 'Malawi',
4014         'MY': 'Malaysia',
4015         'MV': 'Maldives',
4016         'ML': 'Mali',
4017         'MT': 'Malta',
4018         'MH': 'Marshall Islands',
4019         'MQ': 'Martinique',
4020         'MR': 'Mauritania',
4021         'MU': 'Mauritius',
4022         'YT': 'Mayotte',
4023         'MX': 'Mexico',
4024         'FM': 'Micronesia, Federated States of',
4025         'MD': 'Moldova, Republic of',
4026         'MC': 'Monaco',
4027         'MN': 'Mongolia',
4028         'ME': 'Montenegro',
4029         'MS': 'Montserrat',
4030         'MA': 'Morocco',
4031         'MZ': 'Mozambique',
4032         'MM': 'Myanmar',
4033         'NA': 'Namibia',
4034         'NR': 'Nauru',
4035         'NP': 'Nepal',
4036         'NL': 'Netherlands',
4037         'NC': 'New Caledonia',
4038         'NZ': 'New Zealand',
4039         'NI': 'Nicaragua',
4040         'NE': 'Niger',
4041         'NG': 'Nigeria',
4042         'NU': 'Niue',
4043         'NF': 'Norfolk Island',
4044         'MP': 'Northern Mariana Islands',
4045         'NO': 'Norway',
4046         'OM': 'Oman',
4047         'PK': 'Pakistan',
4048         'PW': 'Palau',
4049         'PS': 'Palestine, State of',
4050         'PA': 'Panama',
4051         'PG': 'Papua New Guinea',
4052         'PY': 'Paraguay',
4053         'PE': 'Peru',
4054         'PH': 'Philippines',
4055         'PN': 'Pitcairn',
4056         'PL': 'Poland',
4057         'PT': 'Portugal',
4058         'PR': 'Puerto Rico',
4059         'QA': 'Qatar',
4060         'RE': 'Réunion',
4061         'RO': 'Romania',
4062         'RU': 'Russian Federation',
4063         'RW': 'Rwanda',
4064         'BL': 'Saint Barthélemy',
4065         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4066         'KN': 'Saint Kitts and Nevis',
4067         'LC': 'Saint Lucia',
4068         'MF': 'Saint Martin (French part)',
4069         'PM': 'Saint Pierre and Miquelon',
4070         'VC': 'Saint Vincent and the Grenadines',
4071         'WS': 'Samoa',
4072         'SM': 'San Marino',
4073         'ST': 'Sao Tome and Principe',
4074         'SA': 'Saudi Arabia',
4075         'SN': 'Senegal',
4076         'RS': 'Serbia',
4077         'SC': 'Seychelles',
4078         'SL': 'Sierra Leone',
4079         'SG': 'Singapore',
4080         'SX': 'Sint Maarten (Dutch part)',
4081         'SK': 'Slovakia',
4082         'SI': 'Slovenia',
4083         'SB': 'Solomon Islands',
4084         'SO': 'Somalia',
4085         'ZA': 'South Africa',
4086         'GS': 'South Georgia and the South Sandwich Islands',
4087         'SS': 'South Sudan',
4088         'ES': 'Spain',
4089         'LK': 'Sri Lanka',
4090         'SD': 'Sudan',
4091         'SR': 'Suriname',
4092         'SJ': 'Svalbard and Jan Mayen',
4093         'SZ': 'Swaziland',
4094         'SE': 'Sweden',
4095         'CH': 'Switzerland',
4096         'SY': 'Syrian Arab Republic',
4097         'TW': 'Taiwan, Province of China',
4098         'TJ': 'Tajikistan',
4099         'TZ': 'Tanzania, United Republic of',
4100         'TH': 'Thailand',
4101         'TL': 'Timor-Leste',
4102         'TG': 'Togo',
4103         'TK': 'Tokelau',
4104         'TO': 'Tonga',
4105         'TT': 'Trinidad and Tobago',
4106         'TN': 'Tunisia',
4107         'TR': 'Turkey',
4108         'TM': 'Turkmenistan',
4109         'TC': 'Turks and Caicos Islands',
4110         'TV': 'Tuvalu',
4111         'UG': 'Uganda',
4112         'UA': 'Ukraine',
4113         'AE': 'United Arab Emirates',
4114         'GB': 'United Kingdom',
4115         'US': 'United States',
4116         'UM': 'United States Minor Outlying Islands',
4117         'UY': 'Uruguay',
4118         'UZ': 'Uzbekistan',
4119         'VU': 'Vanuatu',
4120         'VE': 'Venezuela, Bolivarian Republic of',
4121         'VN': 'Viet Nam',
4122         'VG': 'Virgin Islands, British',
4123         'VI': 'Virgin Islands, U.S.',
4124         'WF': 'Wallis and Futuna',
4125         'EH': 'Western Sahara',
4126         'YE': 'Yemen',
4127         'ZM': 'Zambia',
4128         'ZW': 'Zimbabwe',
4129     }
4130
4131     @classmethod
4132     def short2full(cls, code):
4133         """Convert an ISO 3166-2 country code to the corresponding full name"""
4134         return cls._country_map.get(code.upper())
4135
4136
4137 class GeoUtils(object):
4138     # Major IPv4 address blocks per country
4139     _country_ip_map = {
4140         'AD': '46.172.224.0/19',
4141         'AE': '94.200.0.0/13',
4142         'AF': '149.54.0.0/17',
4143         'AG': '209.59.64.0/18',
4144         'AI': '204.14.248.0/21',
4145         'AL': '46.99.0.0/16',
4146         'AM': '46.70.0.0/15',
4147         'AO': '105.168.0.0/13',
4148         'AP': '182.50.184.0/21',
4149         'AQ': '23.154.160.0/24',
4150         'AR': '181.0.0.0/12',
4151         'AS': '202.70.112.0/20',
4152         'AT': '77.116.0.0/14',
4153         'AU': '1.128.0.0/11',
4154         'AW': '181.41.0.0/18',
4155         'AX': '185.217.4.0/22',
4156         'AZ': '5.197.0.0/16',
4157         'BA': '31.176.128.0/17',
4158         'BB': '65.48.128.0/17',
4159         'BD': '114.130.0.0/16',
4160         'BE': '57.0.0.0/8',
4161         'BF': '102.178.0.0/15',
4162         'BG': '95.42.0.0/15',
4163         'BH': '37.131.0.0/17',
4164         'BI': '154.117.192.0/18',
4165         'BJ': '137.255.0.0/16',
4166         'BL': '185.212.72.0/23',
4167         'BM': '196.12.64.0/18',
4168         'BN': '156.31.0.0/16',
4169         'BO': '161.56.0.0/16',
4170         'BQ': '161.0.80.0/20',
4171         'BR': '191.128.0.0/12',
4172         'BS': '24.51.64.0/18',
4173         'BT': '119.2.96.0/19',
4174         'BW': '168.167.0.0/16',
4175         'BY': '178.120.0.0/13',
4176         'BZ': '179.42.192.0/18',
4177         'CA': '99.224.0.0/11',
4178         'CD': '41.243.0.0/16',
4179         'CF': '197.242.176.0/21',
4180         'CG': '160.113.0.0/16',
4181         'CH': '85.0.0.0/13',
4182         'CI': '102.136.0.0/14',
4183         'CK': '202.65.32.0/19',
4184         'CL': '152.172.0.0/14',
4185         'CM': '102.244.0.0/14',
4186         'CN': '36.128.0.0/10',
4187         'CO': '181.240.0.0/12',
4188         'CR': '201.192.0.0/12',
4189         'CU': '152.206.0.0/15',
4190         'CV': '165.90.96.0/19',
4191         'CW': '190.88.128.0/17',
4192         'CY': '31.153.0.0/16',
4193         'CZ': '88.100.0.0/14',
4194         'DE': '53.0.0.0/8',
4195         'DJ': '197.241.0.0/17',
4196         'DK': '87.48.0.0/12',
4197         'DM': '192.243.48.0/20',
4198         'DO': '152.166.0.0/15',
4199         'DZ': '41.96.0.0/12',
4200         'EC': '186.68.0.0/15',
4201         'EE': '90.190.0.0/15',
4202         'EG': '156.160.0.0/11',
4203         'ER': '196.200.96.0/20',
4204         'ES': '88.0.0.0/11',
4205         'ET': '196.188.0.0/14',
4206         'EU': '2.16.0.0/13',
4207         'FI': '91.152.0.0/13',
4208         'FJ': '144.120.0.0/16',
4209         'FK': '80.73.208.0/21',
4210         'FM': '119.252.112.0/20',
4211         'FO': '88.85.32.0/19',
4212         'FR': '90.0.0.0/9',
4213         'GA': '41.158.0.0/15',
4214         'GB': '25.0.0.0/8',
4215         'GD': '74.122.88.0/21',
4216         'GE': '31.146.0.0/16',
4217         'GF': '161.22.64.0/18',
4218         'GG': '62.68.160.0/19',
4219         'GH': '154.160.0.0/12',
4220         'GI': '95.164.0.0/16',
4221         'GL': '88.83.0.0/19',
4222         'GM': '160.182.0.0/15',
4223         'GN': '197.149.192.0/18',
4224         'GP': '104.250.0.0/19',
4225         'GQ': '105.235.224.0/20',
4226         'GR': '94.64.0.0/13',
4227         'GT': '168.234.0.0/16',
4228         'GU': '168.123.0.0/16',
4229         'GW': '197.214.80.0/20',
4230         'GY': '181.41.64.0/18',
4231         'HK': '113.252.0.0/14',
4232         'HN': '181.210.0.0/16',
4233         'HR': '93.136.0.0/13',
4234         'HT': '148.102.128.0/17',
4235         'HU': '84.0.0.0/14',
4236         'ID': '39.192.0.0/10',
4237         'IE': '87.32.0.0/12',
4238         'IL': '79.176.0.0/13',
4239         'IM': '5.62.80.0/20',
4240         'IN': '117.192.0.0/10',
4241         'IO': '203.83.48.0/21',
4242         'IQ': '37.236.0.0/14',
4243         'IR': '2.176.0.0/12',
4244         'IS': '82.221.0.0/16',
4245         'IT': '79.0.0.0/10',
4246         'JE': '87.244.64.0/18',
4247         'JM': '72.27.0.0/17',
4248         'JO': '176.29.0.0/16',
4249         'JP': '133.0.0.0/8',
4250         'KE': '105.48.0.0/12',
4251         'KG': '158.181.128.0/17',
4252         'KH': '36.37.128.0/17',
4253         'KI': '103.25.140.0/22',
4254         'KM': '197.255.224.0/20',
4255         'KN': '198.167.192.0/19',
4256         'KP': '175.45.176.0/22',
4257         'KR': '175.192.0.0/10',
4258         'KW': '37.36.0.0/14',
4259         'KY': '64.96.0.0/15',
4260         'KZ': '2.72.0.0/13',
4261         'LA': '115.84.64.0/18',
4262         'LB': '178.135.0.0/16',
4263         'LC': '24.92.144.0/20',
4264         'LI': '82.117.0.0/19',
4265         'LK': '112.134.0.0/15',
4266         'LR': '102.183.0.0/16',
4267         'LS': '129.232.0.0/17',
4268         'LT': '78.56.0.0/13',
4269         'LU': '188.42.0.0/16',
4270         'LV': '46.109.0.0/16',
4271         'LY': '41.252.0.0/14',
4272         'MA': '105.128.0.0/11',
4273         'MC': '88.209.64.0/18',
4274         'MD': '37.246.0.0/16',
4275         'ME': '178.175.0.0/17',
4276         'MF': '74.112.232.0/21',
4277         'MG': '154.126.0.0/17',
4278         'MH': '117.103.88.0/21',
4279         'MK': '77.28.0.0/15',
4280         'ML': '154.118.128.0/18',
4281         'MM': '37.111.0.0/17',
4282         'MN': '49.0.128.0/17',
4283         'MO': '60.246.0.0/16',
4284         'MP': '202.88.64.0/20',
4285         'MQ': '109.203.224.0/19',
4286         'MR': '41.188.64.0/18',
4287         'MS': '208.90.112.0/22',
4288         'MT': '46.11.0.0/16',
4289         'MU': '105.16.0.0/12',
4290         'MV': '27.114.128.0/18',
4291         'MW': '102.70.0.0/15',
4292         'MX': '187.192.0.0/11',
4293         'MY': '175.136.0.0/13',
4294         'MZ': '197.218.0.0/15',
4295         'NA': '41.182.0.0/16',
4296         'NC': '101.101.0.0/18',
4297         'NE': '197.214.0.0/18',
4298         'NF': '203.17.240.0/22',
4299         'NG': '105.112.0.0/12',
4300         'NI': '186.76.0.0/15',
4301         'NL': '145.96.0.0/11',
4302         'NO': '84.208.0.0/13',
4303         'NP': '36.252.0.0/15',
4304         'NR': '203.98.224.0/19',
4305         'NU': '49.156.48.0/22',
4306         'NZ': '49.224.0.0/14',
4307         'OM': '5.36.0.0/15',
4308         'PA': '186.72.0.0/15',
4309         'PE': '186.160.0.0/14',
4310         'PF': '123.50.64.0/18',
4311         'PG': '124.240.192.0/19',
4312         'PH': '49.144.0.0/13',
4313         'PK': '39.32.0.0/11',
4314         'PL': '83.0.0.0/11',
4315         'PM': '70.36.0.0/20',
4316         'PR': '66.50.0.0/16',
4317         'PS': '188.161.0.0/16',
4318         'PT': '85.240.0.0/13',
4319         'PW': '202.124.224.0/20',
4320         'PY': '181.120.0.0/14',
4321         'QA': '37.210.0.0/15',
4322         'RE': '102.35.0.0/16',
4323         'RO': '79.112.0.0/13',
4324         'RS': '93.86.0.0/15',
4325         'RU': '5.136.0.0/13',
4326         'RW': '41.186.0.0/16',
4327         'SA': '188.48.0.0/13',
4328         'SB': '202.1.160.0/19',
4329         'SC': '154.192.0.0/11',
4330         'SD': '102.120.0.0/13',
4331         'SE': '78.64.0.0/12',
4332         'SG': '8.128.0.0/10',
4333         'SI': '188.196.0.0/14',
4334         'SK': '78.98.0.0/15',
4335         'SL': '102.143.0.0/17',
4336         'SM': '89.186.32.0/19',
4337         'SN': '41.82.0.0/15',
4338         'SO': '154.115.192.0/18',
4339         'SR': '186.179.128.0/17',
4340         'SS': '105.235.208.0/21',
4341         'ST': '197.159.160.0/19',
4342         'SV': '168.243.0.0/16',
4343         'SX': '190.102.0.0/20',
4344         'SY': '5.0.0.0/16',
4345         'SZ': '41.84.224.0/19',
4346         'TC': '65.255.48.0/20',
4347         'TD': '154.68.128.0/19',
4348         'TG': '196.168.0.0/14',
4349         'TH': '171.96.0.0/13',
4350         'TJ': '85.9.128.0/18',
4351         'TK': '27.96.24.0/21',
4352         'TL': '180.189.160.0/20',
4353         'TM': '95.85.96.0/19',
4354         'TN': '197.0.0.0/11',
4355         'TO': '175.176.144.0/21',
4356         'TR': '78.160.0.0/11',
4357         'TT': '186.44.0.0/15',
4358         'TV': '202.2.96.0/19',
4359         'TW': '120.96.0.0/11',
4360         'TZ': '156.156.0.0/14',
4361         'UA': '37.52.0.0/14',
4362         'UG': '102.80.0.0/13',
4363         'US': '6.0.0.0/8',
4364         'UY': '167.56.0.0/13',
4365         'UZ': '84.54.64.0/18',
4366         'VA': '212.77.0.0/19',
4367         'VC': '207.191.240.0/21',
4368         'VE': '186.88.0.0/13',
4369         'VG': '66.81.192.0/20',
4370         'VI': '146.226.0.0/16',
4371         'VN': '14.160.0.0/11',
4372         'VU': '202.80.32.0/20',
4373         'WF': '117.20.32.0/21',
4374         'WS': '202.4.32.0/19',
4375         'YE': '134.35.0.0/16',
4376         'YT': '41.242.116.0/22',
4377         'ZA': '41.0.0.0/11',
4378         'ZM': '102.144.0.0/13',
4379         'ZW': '102.177.192.0/18',
4380     }
4381
4382     @classmethod
4383     def random_ipv4(cls, code_or_block):
4384         if len(code_or_block) == 2:
4385             block = cls._country_ip_map.get(code_or_block.upper())
4386             if not block:
4387                 return None
4388         else:
4389             block = code_or_block
4390         addr, preflen = block.split('/')
4391         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4392         addr_max = addr_min | (0xffffffff >> int(preflen))
4393         return compat_str(socket.inet_ntoa(
4394             compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4395
4396
4397 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4398     def __init__(self, proxies=None):
4399         # Set default handlers
4400         for type in ('http', 'https'):
4401             setattr(self, '%s_open' % type,
4402                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4403                         meth(r, proxy, type))
4404         compat_urllib_request.ProxyHandler.__init__(self, proxies)
4405
4406     def proxy_open(self, req, proxy, type):
4407         req_proxy = req.headers.get('Ytdl-request-proxy')
4408         if req_proxy is not None:
4409             proxy = req_proxy
4410             del req.headers['Ytdl-request-proxy']
4411
4412         if proxy == '__noproxy__':
4413             return None  # No Proxy
4414         if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4415             req.add_header('Ytdl-socks-proxy', proxy)
4416             # yt-dlp's http/https handlers do wrapping the socket with socks
4417             return None
4418         return compat_urllib_request.ProxyHandler.proxy_open(
4419             self, req, proxy, type)
4420
4421
4422 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4423 # released into Public Domain
4424 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4425
4426 def long_to_bytes(n, blocksize=0):
4427     """long_to_bytes(n:long, blocksize:int) : string
4428     Convert a long integer to a byte string.
4429
4430     If optional blocksize is given and greater than zero, pad the front of the
4431     byte string with binary zeros so that the length is a multiple of
4432     blocksize.
4433     """
4434     # after much testing, this algorithm was deemed to be the fastest
4435     s = b''
4436     n = int(n)
4437     while n > 0:
4438         s = compat_struct_pack('>I', n & 0xffffffff) + s
4439         n = n >> 32
4440     # strip off leading zeros
4441     for i in range(len(s)):
4442         if s[i] != b'\000'[0]:
4443             break
4444     else:
4445         # only happens when n == 0
4446         s = b'\000'
4447         i = 0
4448     s = s[i:]
4449     # add back some pad bytes.  this could be done more efficiently w.r.t. the
4450     # de-padding being done above, but sigh...
4451     if blocksize > 0 and len(s) % blocksize:
4452         s = (blocksize - len(s) % blocksize) * b'\000' + s
4453     return s
4454
4455
4456 def bytes_to_long(s):
4457     """bytes_to_long(string) : long
4458     Convert a byte string to a long integer.
4459
4460     This is (essentially) the inverse of long_to_bytes().
4461     """
4462     acc = 0
4463     length = len(s)
4464     if length % 4:
4465         extra = (4 - length % 4)
4466         s = b'\000' * extra + s
4467         length = length + extra
4468     for i in range(0, length, 4):
4469         acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4470     return acc
4471
4472
4473 def ohdave_rsa_encrypt(data, exponent, modulus):
4474     '''
4475     Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4476
4477     Input:
4478         data: data to encrypt, bytes-like object
4479         exponent, modulus: parameter e and N of RSA algorithm, both integer
4480     Output: hex string of encrypted data
4481
4482     Limitation: supports one block encryption only
4483     '''
4484
4485     payload = int(binascii.hexlify(data[::-1]), 16)
4486     encrypted = pow(payload, exponent, modulus)
4487     return '%x' % encrypted
4488
4489
4490 def pkcs1pad(data, length):
4491     """
4492     Padding input data with PKCS#1 scheme
4493
4494     @param {int[]} data        input data
4495     @param {int}   length      target length
4496     @returns {int[]}           padded data
4497     """
4498     if len(data) > length - 11:
4499         raise ValueError('Input data too long for PKCS#1 padding')
4500
4501     pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4502     return [0, 2] + pseudo_random + [0] + data
4503
4504
4505 def encode_base_n(num, n, table=None):
4506     FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4507     if not table:
4508         table = FULL_TABLE[:n]
4509
4510     if n > len(table):
4511         raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4512
4513     if num == 0:
4514         return table[0]
4515
4516     ret = ''
4517     while num:
4518         ret = table[num % n] + ret
4519         num = num // n
4520     return ret
4521
4522
4523 def decode_packed_codes(code):
4524     mobj = re.search(PACKED_CODES_RE, code)
4525     obfuscated_code, base, count, symbols = mobj.groups()
4526     base = int(base)
4527     count = int(count)
4528     symbols = symbols.split('|')
4529     symbol_table = {}
4530
4531     while count:
4532         count -= 1
4533         base_n_count = encode_base_n(count, base)
4534         symbol_table[base_n_count] = symbols[count] or base_n_count
4535
4536     return re.sub(
4537         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4538         obfuscated_code)
4539
4540
4541 def caesar(s, alphabet, shift):
4542     if shift == 0:
4543         return s
4544     l = len(alphabet)
4545     return ''.join(
4546         alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4547         for c in s)
4548
4549
4550 def rot47(s):
4551     return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4552
4553
4554 def parse_m3u8_attributes(attrib):
4555     info = {}
4556     for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4557         if val.startswith('"'):
4558             val = val[1:-1]
4559         info[key] = val
4560     return info
4561
4562
4563 def urshift(val, n):
4564     return val >> n if val >= 0 else (val + 0x100000000) >> n
4565
4566
4567 # Based on png2str() written by @gdkchan and improved by @yokrysty
4568 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4569 def decode_png(png_data):
4570     # Reference: https://www.w3.org/TR/PNG/
4571     header = png_data[8:]
4572
4573     if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4574         raise IOError('Not a valid PNG file.')
4575
4576     int_map = {1: '>B', 2: '>H', 4: '>I'}
4577     unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4578
4579     chunks = []
4580
4581     while header:
4582         length = unpack_integer(header[:4])
4583         header = header[4:]
4584
4585         chunk_type = header[:4]
4586         header = header[4:]
4587
4588         chunk_data = header[:length]
4589         header = header[length:]
4590
4591         header = header[4:]  # Skip CRC
4592
4593         chunks.append({
4594             'type': chunk_type,
4595             'length': length,
4596             'data': chunk_data
4597         })
4598
4599     ihdr = chunks[0]['data']
4600
4601     width = unpack_integer(ihdr[:4])
4602     height = unpack_integer(ihdr[4:8])
4603
4604     idat = b''
4605
4606     for chunk in chunks:
4607         if chunk['type'] == b'IDAT':
4608             idat += chunk['data']
4609
4610     if not idat:
4611         raise IOError('Unable to read PNG data.')
4612
4613     decompressed_data = bytearray(zlib.decompress(idat))
4614
4615     stride = width * 3
4616     pixels = []
4617
4618     def _get_pixel(idx):
4619         x = idx % stride
4620         y = idx // stride
4621         return pixels[y][x]
4622
4623     for y in range(height):
4624         basePos = y * (1 + stride)
4625         filter_type = decompressed_data[basePos]
4626
4627         current_row = []
4628
4629         pixels.append(current_row)
4630
4631         for x in range(stride):
4632             color = decompressed_data[1 + basePos + x]
4633             basex = y * stride + x
4634             left = 0
4635             up = 0
4636
4637             if x > 2:
4638                 left = _get_pixel(basex - 3)
4639             if y > 0:
4640                 up = _get_pixel(basex - stride)
4641
4642             if filter_type == 1:  # Sub
4643                 color = (color + left) & 0xff
4644             elif filter_type == 2:  # Up
4645                 color = (color + up) & 0xff
4646             elif filter_type == 3:  # Average
4647                 color = (color + ((left + up) >> 1)) & 0xff
4648             elif filter_type == 4:  # Paeth
4649                 a = left
4650                 b = up
4651                 c = 0
4652
4653                 if x > 2 and y > 0:
4654                     c = _get_pixel(basex - stride - 3)
4655
4656                 p = a + b - c
4657
4658                 pa = abs(p - a)
4659                 pb = abs(p - b)
4660                 pc = abs(p - c)
4661
4662                 if pa <= pb and pa <= pc:
4663                     color = (color + a) & 0xff
4664                 elif pb <= pc:
4665                     color = (color + b) & 0xff
4666                 else:
4667                     color = (color + c) & 0xff
4668
4669             current_row.append(color)
4670
4671     return width, height, pixels
4672
4673
4674 def write_xattr(path, key, value):
4675     # This mess below finds the best xattr tool for the job
4676     try:
4677         # try the pyxattr module...
4678         import xattr
4679
4680         if hasattr(xattr, 'set'):  # pyxattr
4681             # Unicode arguments are not supported in python-pyxattr until
4682             # version 0.5.0
4683             # See https://github.com/ytdl-org/youtube-dl/issues/5498
4684             pyxattr_required_version = '0.5.0'
4685             if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4686                 # TODO: fallback to CLI tools
4687                 raise XAttrUnavailableError(
4688                     'python-pyxattr is detected but is too old. '
4689                     'yt-dlp requires %s or above while your version is %s. '
4690                     'Falling back to other xattr implementations' % (
4691                         pyxattr_required_version, xattr.__version__))
4692
4693             setxattr = xattr.set
4694         else:  # xattr
4695             setxattr = xattr.setxattr
4696
4697         try:
4698             setxattr(path, key, value)
4699         except EnvironmentError as e:
4700             raise XAttrMetadataError(e.errno, e.strerror)
4701
4702     except ImportError:
4703         if compat_os_name == 'nt':
4704             # Write xattrs to NTFS Alternate Data Streams:
4705             # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4706             assert ':' not in key
4707             assert os.path.exists(path)
4708
4709             ads_fn = path + ':' + key
4710             try:
4711                 with open(ads_fn, 'wb') as f:
4712                     f.write(value)
4713             except EnvironmentError as e:
4714                 raise XAttrMetadataError(e.errno, e.strerror)
4715         else:
4716             user_has_setfattr = check_executable('setfattr', ['--version'])
4717             user_has_xattr = check_executable('xattr', ['-h'])
4718
4719             if user_has_setfattr or user_has_xattr:
4720
4721                 value = value.decode('utf-8')
4722                 if user_has_setfattr:
4723                     executable = 'setfattr'
4724                     opts = ['-n', key, '-v', value]
4725                 elif user_has_xattr:
4726                     executable = 'xattr'
4727                     opts = ['-w', key, value]
4728
4729                 cmd = ([encodeFilename(executable, True)]
4730                        + [encodeArgument(o) for o in opts]
4731                        + [encodeFilename(path, True)])
4732
4733                 try:
4734                     p = Popen(
4735                         cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4736                 except EnvironmentError as e:
4737                     raise XAttrMetadataError(e.errno, e.strerror)
4738                 stdout, stderr = p.communicate_or_kill()
4739                 stderr = stderr.decode('utf-8', 'replace')
4740                 if p.returncode != 0:
4741                     raise XAttrMetadataError(p.returncode, stderr)
4742
4743             else:
4744                 # On Unix, and can't find pyxattr, setfattr, or xattr.
4745                 if sys.platform.startswith('linux'):
4746                     raise XAttrUnavailableError(
4747                         "Couldn't find a tool to set the xattrs. "
4748                         "Install either the python 'pyxattr' or 'xattr' "
4749                         "modules, or the GNU 'attr' package "
4750                         "(which contains the 'setfattr' tool).")
4751                 else:
4752                     raise XAttrUnavailableError(
4753                         "Couldn't find a tool to set the xattrs. "
4754                         "Install either the python 'xattr' module, "
4755                         "or the 'xattr' binary.")
4756
4757
4758 def random_birthday(year_field, month_field, day_field):
4759     start_date = datetime.date(1950, 1, 1)
4760     end_date = datetime.date(1995, 12, 31)
4761     offset = random.randint(0, (end_date - start_date).days)
4762     random_date = start_date + datetime.timedelta(offset)
4763     return {
4764         year_field: str(random_date.year),
4765         month_field: str(random_date.month),
4766         day_field: str(random_date.day),
4767     }
4768
4769
4770 # Templates for internet shortcut files, which are plain text files.
4771 DOT_URL_LINK_TEMPLATE = '''
4772 [InternetShortcut]
4773 URL=%(url)s
4774 '''.lstrip()
4775
4776 DOT_WEBLOC_LINK_TEMPLATE = '''
4777 <?xml version="1.0" encoding="UTF-8"?>
4778 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4779 <plist version="1.0">
4780 <dict>
4781 \t<key>URL</key>
4782 \t<string>%(url)s</string>
4783 </dict>
4784 </plist>
4785 '''.lstrip()
4786
4787 DOT_DESKTOP_LINK_TEMPLATE = '''
4788 [Desktop Entry]
4789 Encoding=UTF-8
4790 Name=%(filename)s
4791 Type=Link
4792 URL=%(url)s
4793 Icon=text-html
4794 '''.lstrip()
4795
4796 LINK_TEMPLATES = {
4797     'url': DOT_URL_LINK_TEMPLATE,
4798     'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4799     'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4800 }
4801
4802
4803 def iri_to_uri(iri):
4804     """
4805     Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4806
4807     The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4808     """
4809
4810     iri_parts = compat_urllib_parse_urlparse(iri)
4811
4812     if '[' in iri_parts.netloc:
4813         raise ValueError('IPv6 URIs are not, yet, supported.')
4814         # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4815
4816     # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4817
4818     net_location = ''
4819     if iri_parts.username:
4820         net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
4821         if iri_parts.password is not None:
4822             net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
4823         net_location += '@'
4824
4825     net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
4826     # The 'idna' encoding produces ASCII text.
4827     if iri_parts.port is not None and iri_parts.port != 80:
4828         net_location += ':' + str(iri_parts.port)
4829
4830     return compat_urllib_parse_urlunparse(
4831         (iri_parts.scheme,
4832             net_location,
4833
4834             compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4835
4836             # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4837             compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4838
4839             # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4840             compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4841
4842             compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4843
4844     # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4845
4846
4847 def to_high_limit_path(path):
4848     if sys.platform in ['win32', 'cygwin']:
4849         # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4850         return r'\\?\ '.rstrip() + os.path.abspath(path)
4851
4852     return path
4853
4854
4855 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4856     if field is None:
4857         val = obj if obj is not None else default
4858     else:
4859         val = obj.get(field, default)
4860     if func and val not in ignore:
4861         val = func(val)
4862     return template % val if val not in ignore else default
4863
4864
4865 def clean_podcast_url(url):
4866     return re.sub(r'''(?x)
4867         (?:
4868             (?:
4869                 chtbl\.com/track|
4870                 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4871                 play\.podtrac\.com
4872             )/[^/]+|
4873             (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4874             flex\.acast\.com|
4875             pd(?:
4876                 cn\.co| # https://podcorn.com/analytics-prefix/
4877                 st\.fm # https://podsights.com/docs/
4878             )/e
4879         )/''', '', url)
4880
4881
4882 _HEX_TABLE = '0123456789abcdef'
4883
4884
4885 def random_uuidv4():
4886     return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4887
4888
4889 def make_dir(path, to_screen=None):
4890     try:
4891         dn = os.path.dirname(path)
4892         if dn and not os.path.exists(dn):
4893             os.makedirs(dn)
4894         return True
4895     except (OSError, IOError) as err:
4896         if callable(to_screen) is not None:
4897             to_screen('unable to create directory ' + error_to_compat_str(err))
4898         return False
4899
4900
4901 def get_executable_path():
4902     from zipimport import zipimporter
4903     if hasattr(sys, 'frozen'):  # Running from PyInstaller
4904         path = os.path.dirname(sys.executable)
4905     elif isinstance(globals().get('__loader__'), zipimporter):  # Running from ZIP
4906         path = os.path.join(os.path.dirname(__file__), '../..')
4907     else:
4908         path = os.path.join(os.path.dirname(__file__), '..')
4909     return os.path.abspath(path)
4910
4911
4912 def load_plugins(name, suffix, namespace):
4913     classes = {}
4914     try:
4915         plugins_spec = importlib.util.spec_from_file_location(
4916             name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
4917         plugins = importlib.util.module_from_spec(plugins_spec)
4918         sys.modules[plugins_spec.name] = plugins
4919         plugins_spec.loader.exec_module(plugins)
4920         for name in dir(plugins):
4921             if name in namespace:
4922                 continue
4923             if not name.endswith(suffix):
4924                 continue
4925             klass = getattr(plugins, name)
4926             classes[name] = namespace[name] = klass
4927     except FileNotFoundError:
4928         pass
4929     return classes
4930
4931
4932 def traverse_obj(
4933         obj, *path_list, default=None, expected_type=None, get_all=True,
4934         casesense=True, is_user_input=False, traverse_string=False):
4935     ''' Traverse nested list/dict/tuple
4936     @param path_list        A list of paths which are checked one by one.
4937                             Each path is a list of keys where each key is a string,
4938                             a function, a tuple of strings or "...".
4939                             When a fuction is given, it takes the key as argument and
4940                             returns whether the key matches or not. When a tuple is given,
4941                             all the keys given in the tuple are traversed, and
4942                             "..." traverses all the keys in the object
4943     @param default          Default value to return
4944     @param expected_type    Only accept final value of this type (Can also be any callable)
4945     @param get_all          Return all the values obtained from a path or only the first one
4946     @param casesense        Whether to consider dictionary keys as case sensitive
4947     @param is_user_input    Whether the keys are generated from user input. If True,
4948                             strings are converted to int/slice if necessary
4949     @param traverse_string  Whether to traverse inside strings. If True, any
4950                             non-compatible object will also be converted into a string
4951     # TODO: Write tests
4952     '''
4953     if not casesense:
4954         _lower = lambda k: (k.lower() if isinstance(k, str) else k)
4955         path_list = (map(_lower, variadic(path)) for path in path_list)
4956
4957     def _traverse_obj(obj, path, _current_depth=0):
4958         nonlocal depth
4959         path = tuple(variadic(path))
4960         for i, key in enumerate(path):
4961             if obj is None:
4962                 return None
4963             if isinstance(key, (list, tuple)):
4964                 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
4965                 key = ...
4966             if key is ...:
4967                 obj = (obj.values() if isinstance(obj, dict)
4968                        else obj if isinstance(obj, (list, tuple, LazyList))
4969                        else str(obj) if traverse_string else [])
4970                 _current_depth += 1
4971                 depth = max(depth, _current_depth)
4972                 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
4973             elif callable(key):
4974                 if isinstance(obj, (list, tuple, LazyList)):
4975                     obj = enumerate(obj)
4976                 elif isinstance(obj, dict):
4977                     obj = obj.items()
4978                 else:
4979                     if not traverse_string:
4980                         return None
4981                     obj = str(obj)
4982                 _current_depth += 1
4983                 depth = max(depth, _current_depth)
4984                 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
4985             elif isinstance(obj, dict) and not (is_user_input and key == ':'):
4986                 obj = (obj.get(key) if casesense or (key in obj)
4987                        else next((v for k, v in obj.items() if _lower(k) == key), None))
4988             else:
4989                 if is_user_input:
4990                     key = (int_or_none(key) if ':' not in key
4991                            else slice(*map(int_or_none, key.split(':'))))
4992                     if key == slice(None):
4993                         return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
4994                 if not isinstance(key, (int, slice)):
4995                     return None
4996                 if not isinstance(obj, (list, tuple, LazyList)):
4997                     if not traverse_string:
4998                         return None
4999                     obj = str(obj)
5000                 try:
5001                     obj = obj[key]
5002                 except IndexError:
5003                     return None
5004         return obj
5005
5006     if isinstance(expected_type, type):
5007         type_test = lambda val: val if isinstance(val, expected_type) else None
5008     elif expected_type is not None:
5009         type_test = expected_type
5010     else:
5011         type_test = lambda val: val
5012
5013     for path in path_list:
5014         depth = 0
5015         val = _traverse_obj(obj, path)
5016         if val is not None:
5017             if depth:
5018                 for _ in range(depth - 1):
5019                     val = itertools.chain.from_iterable(v for v in val if v is not None)
5020                 val = [v for v in map(type_test, val) if v is not None]
5021                 if val:
5022                     return val if get_all else val[0]
5023             else:
5024                 val = type_test(val)
5025                 if val is not None:
5026                     return val
5027     return default
5028
5029
5030 # Deprecated
5031 def traverse_dict(dictn, keys, casesense=True):
5032     write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5033                  'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5034     return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5035
5036
5037 def variadic(x, allowed_types=(str, bytes, dict)):
5038     return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5039
5040
5041 # create a JSON Web Signature (jws) with HS256 algorithm
5042 # the resulting format is in JWS Compact Serialization
5043 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5044 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5045 def jwt_encode_hs256(payload_data, key, headers={}):
5046     header_data = {
5047         'alg': 'HS256',
5048         'typ': 'JWT',
5049     }
5050     if headers:
5051         header_data.update(headers)
5052     header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5053     payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5054     h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5055     signature_b64 = base64.b64encode(h.digest())
5056     token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5057     return token
5058
5059
5060 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5061 def jwt_decode_hs256(jwt):
5062     header_b64, payload_b64, signature_b64 = jwt.split('.')
5063     payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5064     return payload_data
5065
5066
5067 def supports_terminal_sequences(stream):
5068     if compat_os_name == 'nt':
5069         from .compat import WINDOWS_VT_MODE  # Must be imported locally
5070         if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5071             return False
5072     elif not os.getenv('TERM'):
5073         return False
5074     try:
5075         return stream.isatty()
5076     except BaseException:
5077         return False
5078
5079
5080 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5081
5082
5083 def remove_terminal_sequences(string):
5084     return _terminal_sequences_re.sub('', string)
5085
5086
5087 def number_of_digits(number):
5088     return len('%d' % number)
5089
5090
5091 def join_nonempty(*values, delim='-', from_dict=None):
5092     if from_dict is not None:
5093         values = map(from_dict.get, values)
5094     return delim.join(map(str, filter(None, values)))