yt_dlp/utils.py

   1 #!/usr/bin/env python3
   2 # coding: utf-8
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import collections
  11 import contextlib
  12 import ctypes
  13 import datetime
  14 import email.utils
  15 import email.header
  16 import errno
  17 import functools
  18 import gzip
  19 import hashlib
  20 import hmac
  21 import importlib.util
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import operator
  28 import os
  29 import platform
  30 import random
  31 import re
  32 import socket
  33 import ssl
  34 import subprocess
  35 import sys
  36 import tempfile
  37 import time
  38 import traceback
  39 import xml.etree.ElementTree
  40 import zlib
  41 import mimetypes
  42
  43 from .compat import (
  44     compat_HTMLParseError,
  45     compat_HTMLParser,
  46     compat_HTTPError,
  47     compat_basestring,
  48     compat_chr,
  49     compat_cookiejar,
  50     compat_ctypes_WINFUNCTYPE,
  51     compat_etree_fromstring,
  52     compat_expanduser,
  53     compat_html_entities,
  54     compat_html_entities_html5,
  55     compat_http_client,
  56     compat_integer_types,
  57     compat_numeric_types,
  58     compat_kwargs,
  59     compat_os_name,
  60     compat_parse_qs,
  61     compat_shlex_quote,
  62     compat_str,
  63     compat_struct_pack,
  64     compat_struct_unpack,
  65     compat_urllib_error,
  66     compat_urllib_parse,
  67     compat_urllib_parse_urlencode,
  68     compat_urllib_parse_urlparse,
  69     compat_urllib_parse_urlunparse,
  70     compat_urllib_parse_quote,
  71     compat_urllib_parse_quote_plus,
  72     compat_urllib_parse_unquote_plus,
  73     compat_urllib_request,
  74     compat_urlparse,
  75     compat_xpath,
  76 )
  77
  78 from .socks import (
  79     ProxyType,
  80     sockssocket,
  81 )
  82
  83
  84 def register_socks_protocols():
  85     # "Register" SOCKS protocols
  86     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  87     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  88     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  89         if scheme not in compat_urlparse.uses_netloc:
  90             compat_urlparse.uses_netloc.append(scheme)
  91
  92
  93 # This is not clearly defined otherwise
  94 compiled_regex_type = type(re.compile(''))
  95
  96
  97 def random_user_agent():
  98     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  99     _CHROME_VERSIONS = (
 100         '90.0.4430.212',
 101         '90.0.4430.24',
 102         '90.0.4430.70',
 103         '90.0.4430.72',
 104         '90.0.4430.85',
 105         '90.0.4430.93',
 106         '91.0.4472.101',
 107         '91.0.4472.106',
 108         '91.0.4472.114',
 109         '91.0.4472.124',
 110         '91.0.4472.164',
 111         '91.0.4472.19',
 112         '91.0.4472.77',
 113         '92.0.4515.107',
 114         '92.0.4515.115',
 115         '92.0.4515.131',
 116         '92.0.4515.159',
 117         '92.0.4515.43',
 118         '93.0.4556.0',
 119         '93.0.4577.15',
 120         '93.0.4577.63',
 121         '93.0.4577.82',
 122         '94.0.4606.41',
 123         '94.0.4606.54',
 124         '94.0.4606.61',
 125         '94.0.4606.71',
 126         '94.0.4606.81',
 127         '94.0.4606.85',
 128         '95.0.4638.17',
 129         '95.0.4638.50',
 130         '95.0.4638.54',
 131         '95.0.4638.69',
 132         '95.0.4638.74',
 133         '96.0.4664.18',
 134         '96.0.4664.45',
 135         '96.0.4664.55',
 136         '96.0.4664.93',
 137         '97.0.4692.20',
 138     )
 139     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 140
 141
# Default HTTP request headers. The User-Agent is picked by random_user_agent()
# once, when this module is imported, and then stays the same.
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Fixed User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel used as a parameter default so that "no default given" can be told
# apart from an explicit default of None (compared with `is`)
NO_DEFAULT = object()
 156
# Full English month names, in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Full month names per language code, each list in calendar order
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 167
# File extensions (without the leading dot) that this module treats as known
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The replacements are
# zipped position by position with the characters, so both sequences must stay
# aligned; multi-letter replacements ('AE', 'OE', 'TH', 'ss', ...) are list items.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 187
 188 DATE_FORMATS = (
 189     '%d %B %Y',
 190     '%d %b %Y',
 191     '%B %d %Y',
 192     '%B %dst %Y',
 193     '%B %dnd %Y',
 194     '%B %drd %Y',
 195     '%B %dth %Y',
 196     '%b %d %Y',
 197     '%b %dst %Y',
 198     '%b %dnd %Y',
 199     '%b %drd %Y',
 200     '%b %dth %Y',
 201     '%b %dst %Y %I:%M',
 202     '%b %dnd %Y %I:%M',
 203     '%b %drd %Y %I:%M',
 204     '%b %dth %Y %I:%M',
 205     '%Y %m %d',
 206     '%Y-%m-%d',
 207     '%Y.%m.%d.',
 208     '%Y/%m/%d',
 209     '%Y/%m/%d %H:%M',
 210     '%Y/%m/%d %H:%M:%S',
 211     '%Y%m%d%H%M',
 212     '%Y%m%d%H%M%S',
 213     '%Y%m%d',
 214     '%Y-%m-%d %H:%M',
 215     '%Y-%m-%d %H:%M:%S',
 216     '%Y-%m-%d %H:%M:%S.%f',
 217     '%Y-%m-%d %H:%M:%S:%f',
 218     '%d.%m.%Y %H:%M',
 219     '%d.%m.%Y %H.%M',
 220     '%Y-%m-%dT%H:%M:%SZ',
 221     '%Y-%m-%dT%H:%M:%S.%fZ',
 222     '%Y-%m-%dT%H:%M:%S.%f0Z',
 223     '%Y-%m-%dT%H:%M:%S',
 224     '%Y-%m-%dT%H:%M:%S.%f',
 225     '%Y-%m-%dT%H:%M',
 226     '%b %d %Y at %H:%M',
 227     '%b %d %Y at %H:%M:%S',
 228     '%B %d %Y at %H:%M',
 229     '%B %d %Y at %H:%M:%S',
 230     '%H:%M %d-%b-%Y',
 231 )
 232
 233 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 234 DATE_FORMATS_DAY_FIRST.extend([
 235     '%d-%m-%Y',
 236     '%d.%m.%Y',
 237     '%d.%m.%y',
 238     '%d/%m/%Y',
 239     '%d/%m/%y',
 240     '%d/%m/%Y %H:%M:%S',
 241 ])
 242
 243 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 244 DATE_FORMATS_MONTH_FIRST.extend([
 245     '%m-%d-%Y',
 246     '%m.%d.%Y',
 247     '%m/%d/%Y',
 248     '%m/%d/%y',
 249     '%m/%d/%Y %H:%M:%S',
 250 ])
 251
# Matches text of the form }('<payload>',<int>,<int>,'<words>'.split('|') and
# captures the payload, the two integers and the '|'-separated word string
# (presumably the argument list of a packed JavaScript call - verify against callers)
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type is application/ld+json (optionally quoted,
# case-insensitive) and captures its body in the named group `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 254
 255
 256 def preferredencoding():
 257     """Get preferred encoding.
 258
 259     Returns the best encoding scheme for the system, based on
 260     locale.getpreferredencoding() and some further tweaks.
 261     """
 262     try:
 263         pref = locale.getpreferredencoding()
 264         'TEST'.encode(pref)
 265     except Exception:
 266         pref = 'UTF-8'
 267
 268     return pref
 269
 270
 271 def write_json_file(obj, fn):
 272     """ Encode obj as JSON and write it to fn, atomically if possible """
 273
 274     fn = encodeFilename(fn)
 275     if sys.version_info < (3, 0) and sys.platform != 'win32':
 276         encoding = get_filesystem_encoding()
 277         # os.path.basename returns a bytes object, but NamedTemporaryFile
 278         # will fail if the filename contains non ascii characters unless we
 279         # use a unicode object
 280         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 281         # the same for os.path.dirname
 282         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 283     else:
 284         path_basename = os.path.basename
 285         path_dirname = os.path.dirname
 286
 287     args = {
 288         'suffix': '.tmp',
 289         'prefix': path_basename(fn) + '.',
 290         'dir': path_dirname(fn),
 291         'delete': False,
 292     }
 293
 294     # In Python 2.x, json.dump expects a bytestream.
 295     # In Python 3.x, it writes to a character stream
 296     if sys.version_info < (3, 0):
 297         args['mode'] = 'wb'
 298     else:
 299         args.update({
 300             'mode': 'w',
 301             'encoding': 'utf-8',
 302         })
 303
 304     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 305
 306     try:
 307         with tf:
 308             json.dump(obj, tf, ensure_ascii=False)
 309         if sys.platform == 'win32':
 310             # Need to remove existing file on Windows, else os.rename raises
 311             # WindowsError or FileExistsError.
 312             try:
 313                 os.unlink(fn)
 314             except OSError:
 315                 pass
 316         try:
 317             mask = os.umask(0)
 318             os.umask(mask)
 319             os.chmod(tf.name, 0o666 & ~mask)
 320         except OSError:
 321             pass
 322         os.rename(tf.name, fn)
 323     except Exception:
 324         try:
 325             os.remove(tf.name)
 326         except OSError:
 327             pass
 328         raise
 329
 330
 331 if sys.version_info >= (2, 7):
 332     def find_xpath_attr(node, xpath, key, val=None):
 333         """ Find the xpath xpath[@key=val] """
 334         assert re.match(r'^[a-zA-Z_-]+$', key)
 335         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 336         return node.find(expr)
 337 else:
 338     def find_xpath_attr(node, xpath, key, val=None):
 339         for f in node.findall(compat_xpath(xpath)):
 340             if key not in f.attrib:
 341                 continue
 342             if val is None or f.attrib.get(key) == val:
 343                 return f
 344         return None
 345
 346 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 347 # the namespace parameter
 348
 349
 350 def xpath_with_ns(path, ns_map):
 351     components = [c.split(':') for c in path.split('/')]
 352     replaced = []
 353     for c in components:
 354         if len(c) == 1:
 355             replaced.append(c[0])
 356         else:
 357             ns, tag = c
 358             replaced.append('{%s}%s' % (ns_map[ns], tag))
 359     return '/'.join(replaced)
 360
 361
 362 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 363     def _find_xpath(xpath):
 364         return node.find(compat_xpath(xpath))
 365
 366     if isinstance(xpath, (str, compat_str)):
 367         n = _find_xpath(xpath)
 368     else:
 369         for xp in xpath:
 370             n = _find_xpath(xp)
 371             if n is not None:
 372                 break
 373
 374     if n is None:
 375         if default is not NO_DEFAULT:
 376             return default
 377         elif fatal:
 378             name = xpath if name is None else name
 379             raise ExtractorError('Could not find XML element %s' % name)
 380         else:
 381             return None
 382     return n
 383
 384
 385 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 386     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 387     if n is None or n == default:
 388         return n
 389     if n.text is None:
 390         if default is not NO_DEFAULT:
 391             return default
 392         elif fatal:
 393             name = xpath if name is None else name
 394             raise ExtractorError('Could not find XML element\'s text %s' % name)
 395         else:
 396             return None
 397     return n.text
 398
 399
 400 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 401     n = find_xpath_attr(node, xpath, key)
 402     if n is None:
 403         if default is not NO_DEFAULT:
 404             return default
 405         elif fatal:
 406             name = '%s[@%s]' % (xpath, key) if name is None else name
 407             raise ExtractorError('Could not find XML attribute %s' % name)
 408         else:
 409             return None
 410     return n.attrib[key]
 411
 412
 413 def get_element_by_id(id, html):
 414     """Return the content of the tag with the specified ID in the passed HTML document"""
 415     return get_element_by_attribute('id', id, html)
 416
 417
 418 def get_element_by_class(class_name, html):
 419     """Return the content of the first tag with the specified class in the passed HTML document"""
 420     retval = get_elements_by_class(class_name, html)
 421     return retval[0] if retval else None
 422
 423
 424 def get_element_by_attribute(attribute, value, html, escape_value=True):
 425     retval = get_elements_by_attribute(attribute, value, html, escape_value)
 426     return retval[0] if retval else None
 427
 428
 429 def get_elements_by_class(class_name, html):
 430     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 431     return get_elements_by_attribute(
 432         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 433         html, escape_value=False)
 434
 435
 436 def get_elements_by_attribute(attribute, value, html, escape_value=True):
 437     """Return the content of the tag with the specified attribute in the passed HTML document"""
 438
 439     value = re.escape(value) if escape_value else value
 440
 441     retlist = []
 442     for m in re.finditer(r'''(?xs)
 443         <([a-zA-Z0-9:._-]+)
 444          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 445          \s+%s=['"]?%s['"]?
 446          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
 447         \s*>
 448         (?P<content>.*?)
 449         </\1>
 450     ''' % (re.escape(attribute), value), html):
 451         res = m.group('content')
 452
 453         if res.startswith('"') or res.startswith("'"):
 454             res = res[1:-1]
 455
 456         retlist.append(unescapeHTML(res))
 457
 458     return retlist
 459
 460
 461 class HTMLAttributeParser(compat_HTMLParser):
 462     """Trivial HTML parser to gather the attributes for a single element"""
 463
 464     def __init__(self):
 465         self.attrs = {}
 466         compat_HTMLParser.__init__(self)
 467
 468     def handle_starttag(self, tag, attrs):
 469         self.attrs = dict(attrs)
 470
 471
 472 class HTMLListAttrsParser(compat_HTMLParser):
 473     """HTML parser to gather the attributes for the elements of a list"""
 474
 475     def __init__(self):
 476         compat_HTMLParser.__init__(self)
 477         self.items = []
 478         self._level = 0
 479
 480     def handle_starttag(self, tag, attrs):
 481         if tag == 'li' and self._level == 0:
 482             self.items.append(dict(attrs))
 483         self._level += 1
 484
 485     def handle_endtag(self, tag):
 486         self._level -= 1
 487
 488
 489 def extract_attributes(html_element):
 490     """Given a string for an HTML element such as
 491     <el
 492          a="foo" B="bar" c="&98;az" d=boz
 493          empty= noval entity="&amp;"
 494          sq='"' dq="'"
 495     >
 496     Decode and return a dictionary of attributes.
 497     {
 498         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 499         'empty': '', 'noval': None, 'entity': '&',
 500         'sq': '"', 'dq': '\''
 501     }.
 502     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 503     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 504     """
 505     parser = HTMLAttributeParser()
 506     try:
 507         parser.feed(html_element)
 508         parser.close()
 509     # Older Python may throw HTMLParseError in case of malformed HTML
 510     except compat_HTMLParseError:
 511         pass
 512     return parser.attrs
 513
 514
 515 def parse_list(webpage):
 516     """Given a string for an series of HTML <li> elements,
 517     return a dictionary of their attributes"""
 518     parser = HTMLListAttrsParser()
 519     parser.feed(webpage)
 520     parser.close()
 521     return parser.items
 522
 523
 524 def clean_html(html):
 525     """Clean an HTML snippet into a readable string"""
 526
 527     if html is None:  # Convenience for sanitizing descriptions etc.
 528         return html
 529
 530     # Newline vs <br />
 531     html = html.replace('\n', ' ')
 532     html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
 533     html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 534     # Strip html tags
 535     html = re.sub('<.*?>', '', html)
 536     # Replace html entities
 537     html = unescapeHTML(html)
 538     return html.strip()
 539
 540
 541 def sanitize_open(filename, open_mode):
 542     """Try to open the given filename, and slightly tweak it if this fails.
 543
 544     Attempts to open the given filename. If this fails, it tries to change
 545     the filename slightly, step by step, until it's either able to open it
 546     or it fails and raises a final exception, like the standard open()
 547     function.
 548
 549     It returns the tuple (stream, definitive_file_name).
 550     """
 551     try:
 552         if filename == '-':
 553             if sys.platform == 'win32':
 554                 import msvcrt
 555                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 556             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 557         stream = open(encodeFilename(filename), open_mode)
 558         return (stream, filename)
 559     except (IOError, OSError) as err:
 560         if err.errno in (errno.EACCES,):
 561             raise
 562
 563         # In case of error, try to remove win32 forbidden chars
 564         alt_filename = sanitize_path(filename)
 565         if alt_filename == filename:
 566             raise
 567         else:
 568             # An exception here should be caught in the caller
 569             stream = open(encodeFilename(alt_filename), open_mode)
 570             return (stream, alt_filename)
 571
 572
 573 def timeconvert(timestr):
 574     """Convert RFC 2822 defined time string into system timestamp"""
 575     timestamp = None
 576     timetuple = email.utils.parsedate_tz(timestr)
 577     if timetuple is not None:
 578         timestamp = email.utils.mktime_tz(timetuple)
 579     return timestamp
 580
 581
 582 def sanitize_filename(s, restricted=False, is_id=False):
 583     """Sanitizes a string so it could be used as part of a filename.
 584     If restricted is set, use a stricter subset of allowed characters.
 585     Set is_id if this is not an arbitrary string, but an ID that should be kept
 586     if possible.
 587     """
 588     def replace_insane(char):
 589         if restricted and char in ACCENT_CHARS:
 590             return ACCENT_CHARS[char]
 591         elif not restricted and char == '\n':
 592             return ' '
 593         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 594             return ''
 595         elif char == '"':
 596             return '' if restricted else '\''
 597         elif char == ':':
 598             return '_-' if restricted else ' -'
 599         elif char in '\\/|*<>':
 600             return '_'
 601         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 602             return '_'
 603         if restricted and ord(char) > 127:
 604             return '_'
 605         return char
 606
 607     if s == '':
 608         return ''
 609     # Handle timestamps
 610     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 611     result = ''.join(map(replace_insane, s))
 612     if not is_id:
 613         while '__' in result:
 614             result = result.replace('__', '_')
 615         result = result.strip('_')
 616         # Common case of "Foreign band name - English song title"
 617         if restricted and result.startswith('-_'):
 618             result = result[2:]
 619         if result.startswith('-'):
 620             result = '_' + result[len('-'):]
 621         result = result.lstrip('.')
 622         if not result:
 623             result = '_'
 624     return result
 625
 626
 627 def sanitize_path(s, force=False):
 628     """Sanitizes and normalizes path on Windows"""
 629     if sys.platform == 'win32':
 630         force = False
 631         drive_or_unc, _ = os.path.splitdrive(s)
 632         if sys.version_info < (2, 7) and not drive_or_unc:
 633             drive_or_unc, _ = os.path.splitunc(s)
 634     elif force:
 635         drive_or_unc = ''
 636     else:
 637         return s
 638
 639     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 640     if drive_or_unc:
 641         norm_path.pop(0)
 642     sanitized_path = [
 643         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 644         for path_part in norm_path]
 645     if drive_or_unc:
 646         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 647     elif force and s[0] == os.path.sep:
 648         sanitized_path.insert(0, os.path.sep)
 649     return os.path.join(*sanitized_path)
 650
 651
 652 def sanitize_url(url):
 653     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 654     # the number of unwanted failures due to missing protocol
 655     if url.startswith('//'):
 656         return 'http:%s' % url
 657     # Fix some common typos seen so far
 658     COMMON_TYPOS = (
 659         # https://github.com/ytdl-org/youtube-dl/issues/15649
 660         (r'^httpss://', r'https://'),
 661         # https://bx1.be/lives/direct-tv/
 662         (r'^rmtp([es]?)://', r'rtmp\1://'),
 663     )
 664     for mistake, fixup in COMMON_TYPOS:
 665         if re.match(mistake, url):
 666             return re.sub(mistake, fixup, url)
 667     return url
 668
 669
 670 def extract_basic_auth(url):
 671     parts = compat_urlparse.urlsplit(url)
 672     if parts.username is None:
 673         return url, None
 674     url = compat_urlparse.urlunsplit(parts._replace(netloc=(
 675         parts.hostname if parts.port is None
 676         else '%s:%d' % (parts.hostname, parts.port))))
 677     auth_payload = base64.b64encode(
 678         ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
 679     return url, 'Basic ' + auth_payload.decode('utf-8')
 680
 681
 682 def sanitized_Request(url, *args, **kwargs):
 683     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 684     if auth_header is not None:
 685         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 686         headers['Authorization'] = auth_header
 687     return compat_urllib_request.Request(url, *args, **kwargs)
 688
 689
 690 def expand_path(s):
 691     """Expand shell variables and ~"""
 692     return os.path.expandvars(compat_expanduser(s))
 693
 694
 695 def orderedSet(iterable):
 696     """ Remove all duplicates from the input iterable """
 697     res = []
 698     for el in iterable:
 699         if el not in res:
 700             res.append(el)
 701     return res
 702
 703
 704 def _htmlentity_transform(entity_with_semicolon):
 705     """Transforms an HTML entity to a character."""
 706     entity = entity_with_semicolon[:-1]
 707
 708     # Known non-numeric HTML entity
 709     if entity in compat_html_entities.name2codepoint:
 710         return compat_chr(compat_html_entities.name2codepoint[entity])
 711
 712     # TODO: HTML5 allows entities without a semicolon. For example,
 713     # '&Eacuteric' should be decoded as 'Éric'.
 714     if entity_with_semicolon in compat_html_entities_html5:
 715         return compat_html_entities_html5[entity_with_semicolon]
 716
 717     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 718     if mobj is not None:
 719         numstr = mobj.group(1)
 720         if numstr.startswith('x'):
 721             base = 16
 722             numstr = '0%s' % numstr
 723         else:
 724             base = 10
 725         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 726         try:
 727             return compat_chr(int(numstr, base))
 728         except ValueError:
 729             pass
 730
 731     # Unknown entity in name, return its literal representation
 732     return '&%s;' % entity
 733
 734
 735 def unescapeHTML(s):
 736     if s is None:
 737         return None
 738     assert type(s) == compat_str
 739
 740     return re.sub(
 741         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 742
 743
 744 def escapeHTML(text):
 745     return (
 746         text
 747         .replace('&', '&amp;')
 748         .replace('<', '&lt;')
 749         .replace('>', '&gt;')
 750         .replace('"', '&quot;')
 751         .replace("'", '&#39;')
 752     )
 753
 754
 755 def process_communicate_or_kill(p, *args, **kwargs):
 756     try:
 757         return p.communicate(*args, **kwargs)
 758     except BaseException:  # Including KeyboardInterrupt
 759         p.kill()
 760         p.wait()
 761         raise
 762
 763
 764 class Popen(subprocess.Popen):
 765     if sys.platform == 'win32':
 766         _startupinfo = subprocess.STARTUPINFO()
 767         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 768     else:
 769         _startupinfo = None
 770
 771     def __init__(self, *args, **kwargs):
 772         super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)
 773
 774     def communicate_or_kill(self, *args, **kwargs):
 775         return process_communicate_or_kill(self, *args, **kwargs)
 776
 777
 778 def get_subprocess_encoding():
 779     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 780         # For subprocess calls, encode with locale encoding
 781         # Refer to http://stackoverflow.com/a/9951851/35070
 782         encoding = preferredencoding()
 783     else:
 784         encoding = sys.getfilesystemencoding()
 785     if encoding is None:
 786         encoding = 'utf-8'
 787     return encoding
 788
 789
 790 def encodeFilename(s, for_subprocess=False):
 791     """
 792     @param s The name of the file
 793     """
 794
 795     assert type(s) == compat_str
 796
 797     # Python 3 has a Unicode API
 798     if sys.version_info >= (3, 0):
 799         return s
 800
 801     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 802     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 803     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 804     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 805         return s
 806
 807     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 808     if sys.platform.startswith('java'):
 809         return s
 810
 811     return s.encode(get_subprocess_encoding(), 'ignore')
 812
 813
 814 def decodeFilename(b, for_subprocess=False):
 815
 816     if sys.version_info >= (3, 0):
 817         return b
 818
 819     if not isinstance(b, bytes):
 820         return b
 821
 822     return b.decode(get_subprocess_encoding(), 'ignore')
 823
 824
 825 def encodeArgument(s):
 826     if not isinstance(s, compat_str):
 827         # Legacy code that uses byte strings
 828         # Uncomment the following line after fixing all post processors
 829         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 830         s = s.decode('ascii')
 831     return encodeFilename(s, True)
 832
 833
 834 def decodeArgument(b):
 835     return decodeFilename(b, True)
 836
 837
 838 def decodeOption(optval):
 839     if optval is None:
 840         return optval
 841     if isinstance(optval, bytes):
 842         optval = optval.decode(preferredencoding())
 843
 844     assert isinstance(optval, compat_str)
 845     return optval
 846
 847
 848 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 849
 850
 851 def timetuple_from_msec(msec):
 852     secs, msec = divmod(msec, 1000)
 853     mins, secs = divmod(secs, 60)
 854     hrs, mins = divmod(mins, 60)
 855     return _timetuple(hrs, mins, secs, msec)
 856
 857
 858 def formatSeconds(secs, delim=':', msec=False):
 859     time = timetuple_from_msec(secs * 1000)
 860     if time.hours:
 861         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 862     elif time.minutes:
 863         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 864     else:
 865         ret = '%d' % time.seconds
 866     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 867
 868
 869 def _ssl_load_windows_store_certs(ssl_context, storename):
 870     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 871     try:
 872         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 873                  if encoding == 'x509_asn' and (
 874                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 875     except PermissionError:
 876         return
 877     for cert in certs:
 878         try:
 879             ssl_context.load_verify_locations(cadata=cert)
 880         except ssl.SSLError:
 881             pass
 882
 883
 884 def make_HTTPS_handler(params, **kwargs):
 885     opts_check_certificate = not params.get('nocheckcertificate')
 886     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 887     context.check_hostname = opts_check_certificate
 888     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
 889     if opts_check_certificate:
 890         try:
 891             context.load_default_certs()
 892             # Work around the issue in load_default_certs when there are bad certificates. See:
 893             # https://github.com/yt-dlp/yt-dlp/issues/1060,
 894             # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
 895         except ssl.SSLError:
 896             # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
 897             if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
 898                 # Create a new context to discard any certificates that were already loaded
 899                 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 900                 context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
 901                 for storename in ('CA', 'ROOT'):
 902                     _ssl_load_windows_store_certs(context, storename)
 903             context.set_default_verify_paths()
 904     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 905
 906
 907 def bug_reports_message(before=';'):
 908     if ytdl_is_updateable():
 909         update_cmd = 'type  yt-dlp -U  to update'
 910     else:
 911         update_cmd = 'see  https://github.com/yt-dlp/yt-dlp  on how to update'
 912     msg = 'please report this issue on  https://github.com/yt-dlp/yt-dlp .'
 913     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 914     msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
 915
 916     before = before.rstrip()
 917     if not before or before.endswith(('.', '!', '?')):
 918         msg = msg[0].title() + msg[1:]
 919
 920     return (before + ' ' if before else '') + msg
 921
 922
 923 class YoutubeDLError(Exception):
 924     """Base exception for YoutubeDL errors."""
 925     msg = None
 926
 927     def __init__(self, msg=None):
 928         if msg is not None:
 929             self.msg = msg
 930         elif self.msg is None:
 931             self.msg = type(self).__name__
 932         super().__init__(self.msg)
 933
 934
 935 network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
 936 if hasattr(ssl, 'CertificateError'):
 937     network_exceptions.append(ssl.CertificateError)
 938 network_exceptions = tuple(network_exceptions)
 939
 940
 941 class ExtractorError(YoutubeDLError):
 942     """Error during info extraction."""
 943
 944     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
 945         """ tb, if given, is the original traceback (so that it can be printed out).
 946         If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
 947         """
 948         if sys.exc_info()[0] in network_exceptions:
 949             expected = True
 950
 951         self.msg = str(msg)
 952         self.traceback = tb
 953         self.expected = expected
 954         self.cause = cause
 955         self.video_id = video_id
 956         self.ie = ie
 957         self.exc_info = sys.exc_info()  # preserve original exception
 958
 959         super(ExtractorError, self).__init__(''.join((
 960             format_field(ie, template='[%s] '),
 961             format_field(video_id, template='%s: '),
 962             self.msg,
 963             format_field(cause, template=' (caused by %r)'),
 964             '' if expected else bug_reports_message())))
 965
 966     def format_traceback(self):
 967         if self.traceback is None:
 968             return None
 969         return ''.join(traceback.format_tb(self.traceback))
 970
 971
 972 class UnsupportedError(ExtractorError):
 973     def __init__(self, url):
 974         super(UnsupportedError, self).__init__(
 975             'Unsupported URL: %s' % url, expected=True)
 976         self.url = url
 977
 978
 979 class RegexNotFoundError(ExtractorError):
 980     """Error when a regex didn't match"""
 981     pass
 982
 983
 984 class GeoRestrictedError(ExtractorError):
 985     """Geographic restriction Error exception.
 986
 987     This exception may be thrown when a video is not available from your
 988     geographic location due to geographic restrictions imposed by a website.
 989     """
 990
 991     def __init__(self, msg, countries=None, **kwargs):
 992         kwargs['expected'] = True
 993         super(GeoRestrictedError, self).__init__(msg, **kwargs)
 994         self.countries = countries
 995
 996
 997 class DownloadError(YoutubeDLError):
 998     """Download Error exception.
 999
1000     This exception may be thrown by FileDownloader objects if they are not
1001     configured to continue on errors. They will contain the appropriate
1002     error message.
1003     """
1004
1005     def __init__(self, msg, exc_info=None):
1006         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1007         super(DownloadError, self).__init__(msg)
1008         self.exc_info = exc_info
1009
1010
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict.
    """
    # Default message, used when the exception is created without one
    msg = 'Entry not found in info'
1018
1019
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """Append filename, when given, to the default message."""
        if filename is not None:
            # Rebinds msg on the instance only; the class default is untouched
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
1032
1033
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by a PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
1040
1041
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Default message, used when the exception is created without one
    msg = 'The download was cancelled'
1045
1046
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Default message, used when the exception is created without one
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1050
1051
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    # Default message, used when the exception is created without one
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1055
1056
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Default message, used when the exception is created without one
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1060
1061
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """Store msg via the base class and keep the expected flag on the instance."""
        super().__init__(msg)
        self.expected = expected
1068
1069
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    # Fixed message; the initialiser takes no arguments
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        """Initialise with the class message and expected=False."""
        super().__init__(self.msg, expected=False)
1076
1077
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """Append err, when given, to the default message."""
        if err is not None:
            # Rebinds msg on the instance only; the class default is untouched
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1090
1091
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """downloaded and expected are both sizes in bytes."""
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        self.downloaded = downloaded
        self.expected = expected
1107
1108
class XAttrMetadataError(YoutubeDLError):
    """Error while handling xattr metadata, classified into a coarse reason.

    reason is 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED', derived from
    the errno code and/or well-known fragments of the message.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify by errno first, then by the text of the message
        out_of_space = (
            code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in msg
            or 'Disk quota exceeded' in msg)
        if out_of_space:
            reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1123
1124
class XAttrUnavailableError(YoutubeDLError):
    """YoutubeDLError subclass with no behaviour of its own.

    NOTE(review): presumably signals that xattr support is missing - confirm against callers.
    """
1127
1128
1129 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1130     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
1131     # expected HTTP responses to meet HTTP/1.0 or later (see also
1132     # https://github.com/ytdl-org/youtube-dl/issues/6727)
1133     if sys.version_info < (3, 0):
1134         kwargs['strict'] = True
1135     hc = http_class(*args, **compat_kwargs(kwargs))
1136     source_address = ydl_handler._params.get('source_address')
1137
1138     if source_address is not None:
1139         # This is to workaround _create_connection() from socket where it will try all
1140         # address data from getaddrinfo() including IPv6. This filters the result from
1141         # getaddrinfo() based on the source_address value.
1142         # This is based on the cpython socket.create_connection() function.
1143         # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1144         def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1145             host, port = address
1146             err = None
1147             addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1148             af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1149             ip_addrs = [addr for addr in addrs if addr[0] == af]
1150             if addrs and not ip_addrs:
1151                 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1152                 raise socket.error(
1153                     "No remote IP%s addresses available for connect, can't use '%s' as source address"
1154                     % (ip_version, source_address[0]))
1155             for res in ip_addrs:
1156                 af, socktype, proto, canonname, sa = res
1157                 sock = None
1158                 try:
1159                     sock = socket.socket(af, socktype, proto)
1160                     if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1161                         sock.settimeout(timeout)
1162                     sock.bind(source_address)
1163                     sock.connect(sa)
1164                     err = None  # Explicitly break reference cycle
1165                     return sock
1166                 except socket.error as _:
1167                     err = _
1168                     if sock is not None:
1169                         sock.close()
1170             if err is not None:
1171                 raise err
1172             else:
1173                 raise socket.error('getaddrinfo returns an empty list')
1174         if hasattr(hc, '_create_connection'):
1175             hc._create_connection = _create_connection
1176         sa = (source_address, 0)
1177         if hasattr(hc, 'source_address'):  # Python 2.7+
1178             hc.source_address = sa
1179         else:  # Python 2.6
1180             def _hc_connect(self, *args, **kwargs):
1181                 sock = _create_connection(
1182                     (self.host, self.port), self.timeout, sa)
1183                 if is_https:
1184                     self.sock = ssl.wrap_socket(
1185                         sock, self.key_file, self.cert_file,
1186                         ssl_version=ssl.PROTOCOL_TLSv1)
1187                 else:
1188                     self.sock = sock
1189             hc.connect = functools.partial(_hc_connect, hc)
1190
1191     return hc
1192
1193
def handle_youtubedl_headers(headers):
    """Apply the internal "Youtubedl-no-compression" marker to a header mapping.

    Without the marker the very same mapping is returned. With it, a new
    dict is returned that lacks the marker and every Accept-Encoding header
    (matched case-insensitively); the input mapping is not modified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    stripped = {
        name: value for name, value in headers.items()
        if name.lower() != 'accept-encoding'
    }
    del stripped['Youtubedl-no-compression']
    return stripped
1202
1203
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """params is the options mapping; other arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, routing it through a SOCKS proxy if the internal header asks for it."""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # The header is internal and must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        """Percent-encode the request URL and add the standard headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        """Decompress gzip/deflate bodies and percent-encode redirect locations."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets exactly the same request/response processing
    https_request = http_request
    https_response = http_response
1327
1328
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of base_class that connects through a SOCKS proxy.

    base_class  -- an HTTPConnection or HTTPSConnection class
    socks_proxy -- proxy URL with scheme socks, socks4, socks4a or socks5;
                   the port defaults to 1080 and percent-encoded
                   credentials are decoded

    Raises ValueError if the URL scheme is not one of the supported ones.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on socks_type
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS the TLS session is established on top of the proxied socket
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1370
1371
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler supporting a custom connection class, SOCKS proxies and a source address."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """params is the options mapping; https_conn_class defaults to HTTPSConnection."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection

    def https_open(self, req):
        """Open req, routing it through a SOCKS proxy if the internal header asks for it."""
        # Forward whichever of these settings exist on this instance; not
        # every Python version sets both attributes
        forwarded = (('context', '_context'), ('check_hostname', '_check_hostname'))
        kwargs = {
            option: getattr(self, attr)
            for option, attr in forwarded
            if hasattr(self, attr)
        }

        conn_class = self._https_conn_class
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # The header is internal and must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **kwargs)
1395
1396
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
            filename = self.filename

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = 'TRUE' if cookie.secure else 'FALSE'
                initial_dot = 'TRUE' if cookie.domain.startswith('.') else 'FALSE'
                expires = compat_str(cookie.expires) if cookie.expires is not None else ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name, value = '', cookie.name
                else:
                    name, value = cookie.name, cookie.value
                fields = (cookie.domain, initial_dot, cookie.path,
                          secure, expires, name, value)
                f.write('\t'.join(fields) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
            filename = self.filename

        def prepare_line(line):
            """Strip the HttpOnly prefix and validate one cookie file line."""
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            is_comment_or_blank = line.startswith('#') or not line.strip()
            if is_comment_or_blank:
                return line
            fields = line.split('\t')
            if len(fields) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(fields))
            entry = self._CookieFileEntry(*fields)
            if entry.expires_at and not entry.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % entry.expires_at)
            return line

        # Collect the accepted lines in memory and let the base class parse them
        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    accepted = prepare_line(line)
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
                cf.write(accepted)
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1513
1514
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that handles HTTPS traffic the same way as HTTP traffic."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        # Python 2 choked on the next request if the previous response had
        # non-ASCII characters in its Set-Cookie header (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769); the
        # percent-encoding workaround that used to live here is disabled,
        # so the response is passed straight to the stock implementation.
        return super().http_response(request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1537
1538
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler introduces support for experimental HTTP response
    status code 308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # The new request is built without a body, so the headers that
        # describe one are not carried over
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {
            k: v for k, v in req.headers.items()
            if k.lower() not in CONTENT_HEADERS}
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)
1594
1595
def extract_timezone(date_str):
    """Split a trailing timezone designator off a date string.

    Recognises a final 'Z' (UTC) or a final numeric offset of the form
    [+-]hh[:]mm, optionally preceded by one space, after a prefix of at
    least 8 characters.

    Returns a (timezone, date_str) tuple: timezone is a datetime.timedelta
    (zero for 'Z' or when nothing is recognised) and date_str is the input
    with the recognised suffix removed.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            )$                                                   # either form must end the string
        ''', date_str)
    if not m:
        return datetime.timedelta(), date_str

    # The match is anchored at the end, so slicing off len(tz) characters
    # removes exactly the designator
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # Plain 'Z': UTC
        return datetime.timedelta(), date_str

    sign = 1 if m.group('sign') == '+' else -1
    timezone = datetime.timedelta(
        hours=sign * int(m.group('hours')),
        minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1620
1621
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter separates the date part from the time part. timezone, if
    given, is the UTC offset as a datetime.timedelta; otherwise it is taken
    from the string itself. Returns None for None or unparsable input.
    """
    if date_str is None:
        return None

    # Fractional seconds are not supported by the format below; drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
    try:
        local_time = datetime.datetime.strptime(date_str, date_format)
        return calendar.timegm((local_time - timezone).timetuple())
    except ValueError:
        return None
1639
1640
def date_formats(day_first=True):
    """Return the module's date format list for day-first or month-first input."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1643
1644
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Commas become spaces, then AM/PM (plus an optional alphabetic zone
    # name) and any trailing numeric timezone are dropped
    text = re.sub(
        r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    text = extract_timezone(text)[1]

    result = None
    for fmt in date_formats(day_first):
        try:
            # No early exit: a later matching format overrides an earlier one
            result = datetime.datetime.strptime(text, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style parsing
        parts = email.utils.parsedate_tz(text)
        if parts:
            try:
                result = datetime.datetime(*parts[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if result is None else compat_str(result)
1671
1672
def unified_timestamp(date_str, day_first=True):
    """Parse a free-form date/time string into a UNIX timestamp.

    The string is tried against every format from date_formats(day_first);
    if none fits, email.utils.parsedate_tz() is used as a fallback.
    A numeric UTC offset found by extract_timezone() is applied in both
    paths, and 12 hours are added when the string contains "PM".

    Returns an int timestamp, or None if date_str is None or unparseable.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The numeric offset was already cut out of date_str above, so
        # parsedate_tz() cannot see it and calendar.timegm() only reads the
        # broken-down time fields; apply the extracted offset here as well.
        return (calendar.timegm(timetuple) + pm_delta * 3600
                - int(timezone.total_seconds()))
    return None
1704
1705
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL.

    Takes the text after the last dot of the part before any '?'. It is
    returned if purely alphanumeric, or (with trailing slashes removed) if
    listed in KNOWN_EXTENSIONS; otherwise default_ext is returned.
    """
    if url is None or '.' not in url:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1717
1718
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle filename by swapping the extension for '<lang>.<format>'."""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1721
1722
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: strptime format used to parse an absolute date string
    precision: round the time portion of a datetime object.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    now = datetime_round(datetime.datetime.now(), precision)
    if date_str in ('now', 'today'):
        return now
    if date_str == 'yesterday':
        return now - datetime.timedelta(days=1)

    relative = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if relative is None:
        # Plain absolute date
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the sign is itself a date expression
    base = datetime_from_str(relative.group('start'), precision, format)
    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount

    unit = relative.group('unit')
    if unit in ('month', 'year'):
        shifted = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        # Month and year arithmetic is rounded with day granularity
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(shifted, unit) if auto_precision else shifted
1763
1764
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return a date object from a string accepted by datetime_from_str(), i.e.
    YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: strptime format used to parse an absolute date string
    """
    moment = datetime_from_str(date_str, precision='microsecond', format=format)
    return moment.date()
1773
1774
def datetime_add_months(dt, months):
    """Shift a datetime by a number of calendar months (negative to go back).

    The day of month is clamped to the last day of the target month.
    """
    year_shift, month_index = divmod(dt.month - 1 + months, 12)
    target_year = dt.year + year_shift
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(target_year, target_month, min(dt.day, last_day))
1782
1783
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to the nearest day, hour, minute or second

    With precision 'microsecond' the object is returned untouched. Otherwise
    the value goes through calendar.timegm() and utcfromtimestamp(), so any
    microseconds are dropped and a new naive datetime is returned.
    """
    if precision == 'microsecond':
        return dt

    step = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    epoch_seconds = calendar.timegm(dt.timetuple())
    # Adding half a step before flooring rounds to the nearest multiple
    rounded = ((epoch_seconds + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1800
1801
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    found = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if found is None:
        return date_str
    year, month, day = found.groups()
    return '-'.join((year, month, day))
1810
1811
class DateRange(object):
    """Represents a closed time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing bound defaults to the earliest/latest date representable.
        Raises ValueError if start lies after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a date object or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
1841
1842
def platform_name():
    """ Returns platform.platform() as a compat_str, decoding bytes if needed """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1851
1852
def get_windows_version():
    """ Get Windows version as a version tuple. None if it's not running on Windows """
    if compat_os_name != 'nt':
        return None
    return version_tuple(platform.win32_ver()[1])
1859
1860
def _windows_write_string(s, out):
    """ Write s to the Windows console behind out via WriteConsoleW.

    Returns True if the string was written using special methods,
    False if it has yet to be written out (out has no usable fileno, is not
    file descriptor 1 or 2, or its handle is not a console).
    Raises OSError if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE and -12 is STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Bind the kernel32 functions with explicit prototypes
    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Out-parameter: WriteConsoleW stores the amount written here
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        """Return True unless handle is a character device that GetConsoleMode accepts."""
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        # The remote flag is masked out before comparing the file type
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        """Return the index of the first character above U+FFFF, or len(s) if none."""
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call and stop before the next
        # non-BMP character; count == 0 means s starts with one
        count = min(next_nonbmp_pos(s), 1024)

        # A leading non-BMP character is written on its own with a length
        # of 2 (the assert below expects two units to be reported written)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
1933
1934
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On win32, when no encoding is forced, _windows_write_string() is tried
    first. Otherwise the text is encoded (unencodable characters ignored)
    for binary-mode streams and for streams exposing a ``buffer``; other
    streams receive the text as is.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno'))
    if try_console and _windows_write_string(s, stream):
        return

    binary_mode = 'b' in getattr(stream, 'mode', '')
    if binary_mode or sys.version_info[0] < 3:  # Python 2 lies about mode of sys.stderr
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        codec = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(codec, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1955
1956
def bytes_to_intlist(bs):
    """Return the items of a byte sequence as a list of ints ([] if empty)."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Items are characters rather than ints (Python 2 str)
        return [ord(ch) for ch in bs]
    return list(bs)  # Python 3
1964
1965
def intlist_to_bytes(xs):
    """Pack a sequence of ints (one unsigned byte each) into a byte string."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
1970
1971
# Cross-platform file locking
# Exactly one pair of _lock_file(f, exclusive) / _unlock_file(f) is defined:
# LockFileEx/UnlockFileEx on win32, fcntl.flock elsewhere, and stubs that
# raise IOError when fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ctypes structure passed as the Overlapped argument of LockFileEx/UnlockFileEx."""
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count handed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f with LockFileEx; raise OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object because _unlock_file() passes the same
        # pointer back to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file(f); raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Always raise IOError: no locking primitive is available."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Always raise IOError: no locking primitive is available."""
            raise IOError(UNSUPPORTED_MSG)
2045
2046
class locked_file(object):
    """Context manager for a text file that is locked while the with-block runs.

    The file is opened by the constructor. Entering takes the lock through
    _lock_file() (shared for mode 'r', exclusive for 'a' and 'w'); leaving
    releases it and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leave the handle open if the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even when unlocking fails
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Forward to the underlying file's write()."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the underlying file's read()."""
        return self.f.read(*args)
2076
2077
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that reports None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2081
2082
def shell_quote(args):
    """Quote each argument with compat_shlex_quote and join them with spaces.

    Byte strings are decoded with the filesystem encoding first.
    """
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return ' '.join([compat_shlex_quote(as_text(arg)) for arg in args])
2092
2093
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into a '#__youtubedl_smuggle=...' fragment appended
    to url. If url already carries smuggled data, the two are merged and the
    values already in the URL take precedence. The caller's dict is left
    untouched.
    """
    url, idata = unsmuggle_url(url, {})
    # Merge into a copy: updating `data` in place would leak values smuggled
    # in one URL into every later call that reuses the same dict
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
2102
2103
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url() into (url, data).

    URLs without a '#__youtubedl_smuggle' marker are returned unchanged
    together with default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return plain_url, json.loads(encoded)
2111
2112
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like k, M, etc

    num is scaled down by the largest fitting power of factor and rendered
    through fmt, which receives (scaled number, suffix). With factor 1024
    the binary suffixes Ki, Mi, ... are used.
    Returns None if num is None, not convertible to float, or negative.
    """
    num, factor = float_or_none(num), float(factor)
    # math.log() is undefined for negative numbers
    if num is None or num < 0:
        return None
    suffixes = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to the available suffixes: values below 1/factor would give
        # a negative exponent (indexing the list from its end) and values
        # beyond the last suffix would run past it
        exponent = min(max(int(math.log(num, factor)), 0), len(suffixes))
    suffix = ['', *suffixes][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2124
2125
def format_bytes(bytes):
    """Format a byte count with two decimals and a binary suffix, or 'N/A'."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2128
2129
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number><unit>' from s and scale it.

    unit_table maps unit spellings to multipliers. The number may use ','
    or '.' as decimal separator. Returns the product truncated to int, or
    None if s does not start with a number followed by a known unit.
    """
    alternation = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternation, s)
    if found is None:
        return None
    magnitude = float(found.group('num').replace(',', '.'))
    return int(magnitude * unit_table[found.group('unit')])
2139
2140
def parse_filesize(s):
    """Parse a human-readable size such as '5 MiB' or '1.2GB' into bytes.

    Returns an int, or None if s is None or has no recognised number + unit.
    """
    if s is None:
        return None

    # (symbol letter, decimal name prefix, binary name prefix); the position
    # in this tuple gives the power of 1000 / 1024
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, start=1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        # The lower-case forms are of course incorrect and unofficial,
        # but we support those too
        unit_table.update((
            (letter + 'iB', binary),
            (letter + 'B', decimal),
            (lower + 'B', binary),
            (letter + 'b', decimal),
            (lower + 'b', decimal),
            (decimal_name + 'bytes', decimal),
            (binary_name + 'bytes', binary),
        ))

    return lookup_unit_table(unit_table, s)
2210
2211
def parse_count(s):
    """Parse a count such as '1,234', '1.5k' or '2M' into an int.

    Returns None if s is None or holds no recognisable number.
    """
    if s is None:
        return None

    # Drop a leading non-numeric label followed by whitespace
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000, 'K': 1000,
        'm': 1000 ** 2, 'M': 1000 ** 2,
        'kk': 1000 ** 2, 'KK': 1000 ** 2,
        'b': 1000 ** 3, 'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, s)
    if scaled is not None:
        return scaled

    # Number followed by text that is not a known unit
    plain = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(plain.group(1)) if plain else None
2239
2240
def parse_resolution(s):
    """Extract resolution information from a string.

    Recognises '<w>x<h>' (also 'X', '×' or ','), '<h>p' / '<h>i', and
    '4k' / '8k'. Returns a dict with 'width' and 'height', with only
    'height', or an empty dict when nothing matches or s is None.
    """
    if s is None:
        return {}

    dims = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if dims:
        return {'width': int(dims.group('w')), 'height': int(dims.group('h'))}

    lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if lines:
        return {'height': int(lines.group(1))}

    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(kilo.group(1)) * 540}

    return {}
2261
2262
def parse_bitrate(s):
    """Return the integer in front of 'kbps' in s, or None if absent or s is not a string."""
    if not isinstance(s, compat_str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2269
2270
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of a month by its full name

    Names are looked up in MONTH_NAMES[lang], falling back to the English
    list for unknown languages. Returns None for an unknown name. """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
2280
2281
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month by (locale-independently) English
        three-letter abbreviation, or None if it is unknown """
    abbreviations = [full_name[:3] for full_name in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
2290
2291
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity by '&amp;'

    Left alone are &amp; &lt; &gt; &apos; &quot; and numeric character
    references of up to four digits."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
2298
2299
def setproctitle(title):
    """Best-effort: set the process name through libc's prctl().

    Silently does nothing on Jython, when 'libc.so.6' cannot be loaded, or
    when the loaded libc has no prctl symbol.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    # Initialising from the bytes makes the buffer one item longer than the
    # title, so it ends in a NUL. A buffer of exactly len(title_bytes)
    # filled through .value has no terminator, and prctl() would read on
    # into whatever memory follows.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2324
2325
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it is None or lacks the prefix."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2328
2329
def remove_end(s, end):
    """Return s without the suffix end; s unchanged if it is None or lacks the suffix."""
    if s is None or not s.endswith(end):
        return s
    # An empty suffix needs its own branch: s[:-0] is s[:0], i.e. ''
    return s[:-len(end)] if end else s
2332
2333
def remove_quotes(s):
    """Strip one pair of matching single or double quotes enclosing s.

    Values that are None, shorter than two characters or not enclosed in a
    matching pair are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2341
2342
def get_domain(url):
    """Return the host part of url without scheme and leading 'www.', or None if no dotted host is found."""
    found = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not found:
        return None
    return found.group('domain')
2346
2347
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and trailing slashes."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
2351
2352
def base_url(url):
    """Return an http(s) URL up to and including the last '/' before any '?', '#' or '&'.

    Raises AttributeError if url does not match (re.match() returns None).
    """
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
2355
2356
def urljoin(base, path):
    """Join path onto base, returning None for unusable input.

    Byte strings are decoded as UTF-8. A path that already has a scheme or
    is protocol-relative is returned as is. Otherwise base must be an
    http(s) or protocol-relative URL, else None is returned.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        # Already absolute
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    usable_base = isinstance(base, compat_str) and re.match(
        r'^(?:https?:)?//', base)
    if not usable_base:
        return None
    return compat_urlparse.urljoin(base, path)
2370
2371
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose get_method() always reports 'HEAD'."""

    def get_method(self):
        return 'HEAD'
2375
2376
class PUTRequest(compat_urllib_request.Request):
    """Request subclass whose get_method() always reports 'PUT'."""

    def get_method(self):
        return 'PUT'
2380
2381
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default when that is not possible.

    get_attr: if set, the attribute of that name is read from v first
    scale / invscale: the result is int(v) * invscale // scale
    None, '' and values int() rejects yield default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
2394
2395
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
2398
2399
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Integers are returned unchanged; strings have ',', '.' and '+' removed
    before conversion; any other type yields None. """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if isinstance(int_str, compat_str):
        digits = re.sub(r'[,\.\+]', '', int_str)
        return int_or_none(digits)
    return None
2407
2408
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default if v is None or not convertible."""
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
2416
2417
def bool_or_none(v, default=None):
    """Return v if it is a real bool, otherwise default."""
    if isinstance(v, bool):
        return v
    return default
2420
2421
def strip_or_none(v, default=None):
    """Return v.strip() for text values, otherwise default."""
    if not isinstance(v, compat_str):
        return default
    return v.strip()
2424
2425
def url_or_none(url):
    """Return the stripped url if it looks like a supported URL, else None.

    Accepted are protocol-relative URLs and the http(s), rtmp/rtmfp/rtsp
    families, mms and ftp(s) schemes matched by the pattern below.
    """
    if not url or not isinstance(url, compat_str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2431
2432
def strftime_or_none(timestamp, date_format, default=None):
    """Format a UNIX timestamp (number) or a YYYYMMDD string with date_format.

    Returns default for other input types and when parsing or formatting
    raises ValueError, TypeError or AttributeError.
    """
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            moment = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            moment = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            # Unsupported type: .strftime on None raises AttributeError below
            moment = None
        return moment.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2443
2444
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in turn: colon-separated '[[[DD:]HH:]MM:]SS[.ms]',
    unit-suffixed (ISO 8601 like 'PT1H30M' or '1h 30min 5s'; year, month and
    week parts are accepted but not counted), and a single fractional
    '<n> hours' / '<n> mins'. Returns None for non-strings, blank strings
    and unrecognised input.
    """
    if not isinstance(s, compat_basestring):
        return None
    s = s.strip()
    if not s:
        return None

    # The first two patterns expose the same five groups in the same order
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if m is None:
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)

    if m is not None:
        days, hours, mins, secs, ms = m.groups()
    else:
        fractional = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
        if fractional is None:
            return None
        days = secs = ms = None
        hours, mins = fractional.groups()

    total = 0
    if secs:
        total += float(secs)
    if mins:
        total += float(mins) * 60
    if hours:
        total += float(hours) * 60 * 60
    if days:
        total += float(days) * 24 * 60 * 60
    if ms:
        # ms keeps its leading dot, so float() yields the fraction of a second
        total += float(ms)
    return total
2502
2503
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the filename's real extension.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(stem, ext, real_ext)
2510
2511
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's real extension by ext.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
2517
2518
def check_executable(exe, args=[]):
    """ Checks if the given binary can be launched, and returns its name
    (False if starting it raises OSError).
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate_or_kill()
    except OSError:
        return False
    return exe
2527
2528
def _get_exe_version_output(exe, args):
    """Run exe with args and return its combined stdout/stderr as text.

    Returns False if the process cannot be started (OSError).
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        proc = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = proc.communicate_or_kill()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        return output.decode('ascii', 'ignore')
    return output
2542
2543
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Extracts the first group of `version_re` from `output`.
    Returns `unrecognized` when the pattern is not found.
    Without `version_re`, the text following the word "version" is used """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2553
2554
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present (or printed nothing) """
    output = _get_exe_version_output(exe, args)
    if not output:
        return False
    return detect_exe_version(output, version_re, unrecognized)
2561
2562
class LazyList(collections.abc.Sequence):
    ''' Lazy immutable list from an iterable
    Items are read from the iterable only when they are needed and are kept
    in an internal cache afterwards
    Note that slices of a LazyList are lists and not LazyList'''

    class IndexError(IndexError):
        # Subclass of the builtin IndexError; __getitem__ raises this for out-of-range indices
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # @param reverse  Whether the items are presented in reverse order
        # @param _cache   List holding the already evaluated items. __reversed__ and
        #                 __copy__ pass their own cache so that it is shared
        self.__iterable = iter(iterable)
        self.__cache = [] if _cache is None else _cache
        self.__reversed = reverse

    def __iter__(self):
        if self.__reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Replay the items evaluated so far, then keep pulling new items
        # from the iterable, caching each one before it is yielded
        yield from self.__cache
        for item in self.__iterable:
            self.__cache.append(item)
            yield item

    def __exhaust(self):
        # Move all remaining items into the cache and return the cache itself (not a copy)
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        ''' Evaluate the entire iterable '''
        # The slice creates a new list; it is in reverse order for a reversed LazyList
        return self.__exhaust()[::-1 if self.__reversed else 1]

    @staticmethod
    def __reverse_index(x):
        # Map an index of the reversed sequence to the matching index of the
        # forward cache (0 -> -1, 1 -> -2, -1 -> 0, ...). None is kept as None
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        # Accepts an int or a slice. Only as many items as necessary are evaluated,
        # unless the request can only be answered with the complete list.
        # Raises TypeError for other index types and self.IndexError when out of range
        if isinstance(idx, slice):
            if self.__reversed:
                # Translate to the equivalent slice of the forward cache
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            # step = 0 so that neither of the step checks below applies to a plain index
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
            try:
                return self.__cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of items still missing from the cache to reach the largest requested index
        n = max(start or 0, stop or 0) - len(self.__cache) + 1
        if n > 0:
            self.__cache.extend(itertools.islice(self.__iterable, n))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Non-empty iff one item can be fetched. For a reversed list, index -1
        # translates to index 0 of the cache, so a single item is evaluated either way
        try:
            self[-1] if self.__reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # The length is only known after evaluating the entire iterable
        self.__exhaust()
        return len(self.__cache)

    def __reversed__(self):
        # New instance with the opposite direction; it is given this instance's
        # iterable and cache list
        return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)

    def __copy__(self):
        # New instance with the same direction; it is given this instance's
        # iterable and cache list, so nothing is evaluated twice
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        # Same text as __repr__
        return repr(self.exhaust())
2651
2652
class PagedList:
    """ Base class for lists whose entries are fetched page by page through
    `pagefunc(pagenum)`. Subclasses must implement `_getslice` """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """ Returns the entries of page `pagenum` as a list, using the cache when possible """
        cached = self._cache.get(pagenum)
        entries = list(self._pagefunc(pagenum)) if cached is None else cached
        if self._use_cache:
            self._cache[pagenum] = entries
        return entries

    def getslice(self, start=0, end=None):
        """ Returns the entries with index in [start, end) as a list """
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # NOTE: cache must be enabled if this is used
        if isinstance(idx, int) and idx >= 0:
            found = self.getslice(idx, idx + 1)
            if found:
                return found[0]
            raise self.IndexError()
        raise TypeError('indices must be non-negative integers')
2690
2691
class OnDemandPagedList(PagedList):
    """ PagedList that requests one page after another until a page with
    fewer than `pagesize` entries is returned """

    def _getslice(self, start, end):
        size = self._pagesize
        for page_index in itertools.count(start // size):
            page_first = page_index * size
            page_limit = page_first + size
            if start >= page_limit:
                continue

            # Offsets of the requested range relative to the current page
            skip = start % size if page_first <= start < page_limit else 0
            if end is not None and page_first <= end <= page_limit:
                stop = ((end - 1) % size) + 1
            else:
                stop = None

            entries = self.getpage(page_index)
            if skip != 0 or stop is not None:
                entries = entries[skip:stop]
            yield from entries

            # A page that is not "full" is taken to be the last one, so no
            # further pages are queried. The same applies when the requested
            # range ends exactly at the end of a full page
            if len(entries) + skip < size or end == page_limit:
                break
2725
2726
class InAdvancePagedList(PagedList):
    """ PagedList for which the total number of pages is known in advance

    @param pagefunc   Callable returning the entries of a page, given its number
    @param pagecount  Total number of pages; pages >= pagecount are never requested
    @param pagesize   Number of entries on a full page
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagecount = pagecount
        PagedList.__init__(self, pagefunc, pagesize, True)

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        # Clamp to the known page count so that a slice ending beyond the
        # last page does not request pages that do not exist
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the beginning of the first page
        skip_elems = start - start_page * self._pagesize
        # Entries still to be yielded; None means "everything up to the last page"
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
2750
2751
def uppercase_escape(s):
    """ Replaces every \\UXXXXXXXX escape sequence in `s` with the character it denotes """
    decode_escape = codecs.getdecoder('unicode_escape')

    def _decode(mobj):
        return decode_escape(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode, s)
2758
2759
def lowercase_escape(s):
    """ Replaces every \\uXXXX escape sequence in `s` with the character it denotes """
    decode_escape = codecs.getdecoder('unicode_escape')

    def _decode(mobj):
        return decode_escape(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode, s)
2766
2767
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that are passed through without being percent-encoded
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
2773
2774
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded; all other components are percent-encoded
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
2785
2786
def parse_qs(url):
    """ Returns the query string of `url`, parsed with compat_parse_qs """
    query = compat_urllib_parse_urlparse(url).query
    return compat_parse_qs(query)
2789
2790
def read_batch_urls(batch_fd):
    """ Reads the URLs from the lines of `batch_fd` and closes it afterwards.
    Blank lines and lines starting with "#", ";" or "]" are skipped """
    boms = ('\xef\xbb\xbf', '\ufeff')

    def clean(line):
        text = line if isinstance(line, compat_str) else line.decode('utf-8', 'replace')
        for bom in boms:
            if text.startswith(bom):
                text = text[len(bom):]
        text = text.lstrip()
        if not text or text.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', text, 1)[0].rstrip()

    urls = []
    with contextlib.closing(batch_fd) as fd:
        for raw_line in fd:
            cleaned = clean(raw_line)
            if cleaned:
                urls.append(cleaned)
    return urls
2808
2809
def urlencode_postdata(*args, **kargs):
    """ URL-encodes the given arguments and returns the result as ASCII bytes """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2812
2813
def update_url_query(url, query):
    """ Returns `url` with the parameters from the `query` dict added to its
    query string; parameters with the same name are replaced.
    `url` is returned untouched when `query` is empty """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
2822
2823
def update_Request(req, url=None, data=None, headers={}, query={}):
    """ Builds a new request from `req`, optionally with a different URL and
    data, additional headers and additional query parameters.
    HEAD and PUT requests keep their request class """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    body = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    method = req.get_method()
    request_cls = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(method, compat_urllib_request.Request)
    new_req = request_cls(
        target_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # timeout is only copied when the source request has one
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2842
2843
2844 def _multipart_encode_impl(data, boundary):
2845     content_type = 'multipart/form-data; boundary=%s' % boundary
2846
2847     out = b''
2848     for k, v in data.items():
2849         out += b'--' + boundary.encode('ascii') + b'\r\n'
2850         if isinstance(k, compat_str):
2851             k = k.encode('utf-8')
2852         if isinstance(v, compat_str):
2853             v = v.encode('utf-8')
2854         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2855         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2856         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2857         if boundary.encode('ascii') in content:
2858             raise ValueError('Boundary overlaps with data')
2859         out += content
2860
2861     out += b'--' + boundary.encode('ascii') + b'--\r\n'
2862
2863     return out, content_type
2864
2865
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple (encoded bytes, Content-Type value). A ValueError is
    raised if a specified boundary occurs in the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A boundary chosen by the caller is never replaced
        return _multipart_encode_impl(data, boundary)

    # Draw random boundaries until one does not collide with the data
    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
2894
2895
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """ Looks up a key, or the first usable key out of a list/tuple of keys, in `d`.
    With a list/tuple, keys that are missing or map to None are skipped, as are
    falsy values unless `skip_false_values` is disabled.
    Returns `default` when nothing is found """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None or (skip_false_values and not value):
            continue
        return value
    return default
2904
2905
def try_get(src, getter, expected_type=None):
    """ Applies the getter(s) to `src` one after another and returns the first
    result that is of `expected_type` (any result if `expected_type` is None).
    Getters raising AttributeError, KeyError, TypeError or IndexError are skipped.
    Returns None if no getter succeeds """
    for get in variadic(getter):
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
2915
2916
def merge_dicts(*dicts):
    """ Merges the given dicts into a new one; for every key the first value
    that is not None wins. The only exception: an empty string may be replaced
    by a non-empty string from a later dict """
    result = {}
    for source in dicts:
        for key, value in source.items():
            if value is None:
                continue
            if key in result:
                current = result[key]
                replaceable = (
                    isinstance(value, compat_str) and value
                    and isinstance(current, compat_str) and not current)
                if not replaceable:
                    continue
            result[key] = value
    return result
2929
2930
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """ Returns `string` unchanged if it already is a compat_str;
    otherwise converts it with the given encoding and error handling """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2933
2934
# Rating name -> age limit. parse_age_limit() looks its upper-cased input up
# in this table
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# "TV-..." rating name -> age limit. parse_age_limit() builds its regex from
# the part of each key after "TV-" and accepts "TV", "TV-" or "TV_" as prefix
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2952
2953
def parse_age_limit(s):
    """ Converts an age limit given as an int, a string such as "18" or "18+",
    or a rating name from US_RATINGS / TV_PARENTAL_GUIDELINES into an int.
    Returns None if the value is not understood """
    # Exact type check: bool (a subclass of int) does not take this branch
    if type(s) is int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    tv_suffixes = '|'.join(name[3:] for name in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
2969
2970
def strip_jsonp(code):
    """ Removes a JSONP wrapper such as `callback(...);` from `code` and returns
    the wrapped data. Text that does not look like JSONP is returned unchanged """
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
2979
2980
def js_to_json(code, vars={}):
    """ Rewrites JavaScript-style literals in `code` so that the result is
    closer to JSON: strings get double quotes, bare identifiers are quoted,
    comments and trailing commas are removed, hex/octal integers become decimal

    @param code  The JavaScript text
    @param vars  Dict of identifier -> replacement text; a matching bare
                 identifier is replaced by its value without any quoting
    @returns     The rewritten text
    """
    # vars is a dict of var, val pairs to substitute
    # /* ... */ block comments and // comments up to and including the newline
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Whitespace, optionally with a single comment in between
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (regex, base) pairs for integer literals, optionally followed by ":" (i.e. used as a key)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Replacement callback: receives one token matched by the regex at the
        # bottom of this function and returns its JSON form
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, runs of "!" and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # String literal: strip the quotes and adjust the escapes for a
            # double-quoted JSON string. Unlisted escapes are kept as they are
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    # Integers used as keys must be quoted in JSON
                    return '"%d":' % i if v.endswith(':') else '%d' % i

            if v in vars:
                return vars[v]

        # Strings, identifiers and decimal integer keys end up double quoted
        return '"%s"' % v

    # Alternatives of the token regex, in order: double-quoted string,
    # single-quoted string, comment, comma directly before "]" or "}",
    # "void 0", identifier, hex/octal integer (optionally as a key),
    # decimal integer used as a key, run of "!"
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3027
3028
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.
    The returned function maps a quality id to its position in `quality_ids`,
    or to -1 if the id is not in the list """
    def quality_of(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return quality_of
3037
3038
# Default output templates, written as %-style format strings with named fields
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# NOTE(review): the values look like the filename suffix used for each kind of
# output file (None = no fixed suffix) - confirm against the code that reads this
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template for a regex matching one such format specifier. {0} stands for the
# pattern of the mapping key and {1} for the pattern of the conversion type;
# presumably both are filled in with str.format before compiling - confirm at the use sites
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# NOTE(review): presumably the conversion type characters to be substituted
# for {1} in STR_FORMAT_RE_TMPL - confirm at the use sites
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3074
3075
def limit_length(s, length):
    """ Add ellipses to overly long strings.
    Strings longer than `length` are cut so that the result, including the
    ellipses, is `length` characters long. None is passed through """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ellipses = '...'
    return s[:length - len(ellipses)] + ellipses
3084
3085
def version_tuple(v):
    """ Splits a version string at "." and "-" and returns its parts as a tuple of ints.
    Raises ValueError if a part is not an integer """
    return tuple(map(int, re.split(r'[-.]', v)))
3088
3089
def is_outdated_version(version, limit, assume_new=True):
    """ Returns whether `version` is older than `limit`.
    If `version` is empty or one of the versions cannot be parsed, the result
    is decided by `assume_new`: not outdated if it is set, outdated otherwise """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
3097
3098
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at the top of the file
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
3105
3106
def args_to_str(args):
    """ Returns the arguments of a subprocess command as a single string,
    each of them quoted with compat_shlex_quote """
    return ' '.join(map(compat_shlex_quote, args))
3110
3111
def error_to_compat_str(err):
    """ Returns the message of an exception as text """
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
3119
3120
def mimetype2ext(mt):
    """ Guesses the file extension for a MIME type

    @param mt  MIME type, optionally followed by parameters ("; charset=...").
               Matching is case-insensitive
    @returns   The extension without leading dot; None if `mt` is None.
               Unknown subtypes are returned as they are (lower-cased, with
               "+" replaced by ".")
    """
    if mt is None:
        return None

    # Parameters are irrelevant here. Type and subtype names are
    # case-insensitive, so normalize them once for all lookups below
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    # Second attempt: the subtype alone, whatever the main type is
    _, _, subtype = mt.rpartition('/')
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    # Third attempt: the structured syntax suffix ("+json", "+xml", ...)
    _, _, suffix = subtype.partition('+')
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
3183
3184
def ext2mimetype(ext_or_url):
    """ Guesses the MIME type for a file extension or for a filename/URL.
    A value without any "." is treated as a bare extension.
    Returns None for an empty value or if the type cannot be guessed """
    if not ext_or_url:
        return None
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _ = mimetypes.guess_type(filename)
    return mime_type
3191
3192
def parse_codecs(codecs_str):
    """ Parses a comma separated codecs string into a dict with the keys
    "vcodec", "acodec", "dynamic_range" and, for text codecs, "tcodec".
    Only the first codec of each kind is used. If no codec is recognized but
    exactly two are given, they are taken as video and audio codec.
    Returns an empty dict if nothing can be determined """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    video_codecs = (
        'avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    audio_codecs = (
        'flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3',
        'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    text_codecs = ('stpp', 'wvtt')

    pieces = codecs_str.strip().strip(',').split(',')
    names = [name for name in (piece.strip() for piece in pieces) if name]

    vcodec = acodec = tcodec = hdr = None
    for full_codec in names:
        parts = full_codec.split('.')
        # Zeros are ignored in the codec id, so that e.g. "av01" is found as "av1"
        codec = parts[0].replace('0', '')
        if codec in video_codecs:
            if vcodec:
                continue
            if codec in ('vp9', 'av1', 'hvc1'):
                vcodec = '.'.join(parts[:4])
            else:
                vcodec = full_codec
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in audio_codecs:
            if not acodec:
                acodec = full_codec
        elif codec in text_codecs:
            if not tcodec:
                tcodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if not (vcodec or acodec or tcodec):
        if len(names) == 2:
            return {
                'vcodec': names[0],
                'acodec': names[1],
            }
        return {}

    result = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if tcodec is not None:
        result['tcodec'] = tcodec
    return result
3234
3235
def urlhandle_detect_ext(url_handle):
    """ Determines the file extension from the headers of a response: the
    filename of a Content-Disposition attachment is preferred, the
    Content-Type is used otherwise """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = determine_ext(mobj.group('filename'), default_ext=None) if mobj else None
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
3248
3249
def encode_data_uri(data, mime_type):
    """ Returns a base64 "data:" URI for the bytes `data` with the given MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3252
3253
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked if no limit is set (age_limit is None) or if the
    # content is available for everyone (content_limit is None)
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3262
3263
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.
    The result is a match object (truthy) if the text starts with "<",
    optionally after whitespace, and None otherwise """

    # The UTF-32 LE BOM starts with the UTF-16 LE one, so it must be checked first
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a BOM, the data is assumed to be UTF-8
    encoding, payload = 'utf-8', first_bytes
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload = bom_encoding, first_bytes[len(bom):]
            break

    return re.match(r'^\s*<', payload.decode(encoding, 'replace'))
3282
3283
def determine_protocol(info_dict):
    """ Returns the protocol of a format: its "protocol" field if present;
    otherwise it is derived from the beginning of the (sanitized) URL, from
    the URL's extension (m3u8, f4m) or, finally, from the URL scheme """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
3304
3305
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned

    @param header_row  The cells of the header
    @param data        The rows of the table
    @param delim       If set, a line made of this string is inserted below the header
    @param extra_gap   Number of additional spaces between columns
    @param hide_empty  Whether to drop the columns that are empty in all rows of `data`
    @returns           The table as a string, one line per row

    The rows passed in are not modified
    """
    def width(string):
        # Visible width: terminal sequences and the alignment tab do not count
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        return [col for (take, col) in zip(filterArray, row) if take]

    if hide_empty:
        # A column is kept if at least one of its data cells has a non-zero width
        max_lens = get_max_lens(data)
        header_row = filter_using_list(header_row, max_lens)
        data = [filter_using_list(row, max_lens) for row in data]
    else:
        # The cells are padded in place below, so work on copies in order to
        # leave the caller's rows untouched (this also allows tuples as rows)
        header_row = list(header_row)
        data = [list(row) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # The tab becomes the padding, pushing the text after it to the right
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3337
3338
3339 def _match_one(filter_part, dct, incomplete):
3340     # TODO: Generalize code with YoutubeDL._build_format_filter
3341     STRING_OPERATORS = {
3342         '*=': operator.contains,
3343         '^=': lambda attr, value: attr.startswith(value),
3344         '$=': lambda attr, value: attr.endswith(value),
3345         '~=': lambda attr, value: re.search(value, attr),
3346     }
3347     COMPARISON_OPERATORS = {
3348         **STRING_OPERATORS,
3349         '<=': operator.le,  # "<=" must be defined above "<"
3350         '<': operator.lt,
3351         '>=': operator.ge,
3352         '>': operator.gt,
3353         '=': operator.eq,
3354     }
3355
3356     operator_rex = re.compile(r'''(?x)\s*
3357         (?P<key>[a-z_]+)
3358         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3359         (?:
3360             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3361             (?P<strval>.+?)
3362         )
3363         \s*$
3364         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3365     m = operator_rex.search(filter_part)
3366     if m:
3367         m = m.groupdict()
3368         unnegated_op = COMPARISON_OPERATORS[m['op']]
3369         if m['negation']:
3370             op = lambda attr, value: not unnegated_op(attr, value)
3371         else:
3372             op = unnegated_op
3373         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3374         if m['quote']:
3375             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3376         actual_value = dct.get(m['key'])
3377         numeric_comparison = None
3378         if isinstance(actual_value, compat_numeric_types):
3379             # If the original field is a string and matching comparisonvalue is
3380             # a number we should respect the origin of the original field
3381             # and process comparison value as a string (see
3382             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3383             try:
3384                 numeric_comparison = int(comparison_value)
3385             except ValueError:
3386                 numeric_comparison = parse_filesize(comparison_value)
3387                 if numeric_comparison is None:
3388                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3389                 if numeric_comparison is None:
3390                     numeric_comparison = parse_duration(comparison_value)
3391         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3392             raise ValueError('Operator %s only supports string values!' % m['op'])
3393         if actual_value is None:
3394             return incomplete or m['none_inclusive']
3395         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3396
3397     UNARY_OPERATORS = {
3398         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3399         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3400     }
3401     operator_rex = re.compile(r'''(?x)\s*
3402         (?P<op>%s)\s*(?P<key>[a-z_]+)
3403         \s*$
3404         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3405     m = operator_rex.search(filter_part)
3406     if m:
3407         op = UNARY_OPERATORS[m.group('op')]
3408         actual_value = dct.get(m.group('key'))
3409         if incomplete and actual_value is None:
3410             return True
3411         return op(actual_value)
3412
3413     raise ValueError('Invalid filter part %r' % filter_part)
3414
3415
def match_str(filter_str, dct, incomplete=False):
    """Check a dictionary against a simple '&'-separated filter string.

    The filter is split on every '&' that is not preceded by a backslash;
    an escaped '\\&' is turned back into a literal '&' before the part is
    evaluated by _match_one.

    Returns True when every part passes, False as soon as one part fails.
    `incomplete` is forwarded unchanged to _match_one.
    """
    for raw_part in re.split(r'(?<!\\)&', filter_str):
        condition = raw_part.replace(r'\&', '&')
        if not _match_one(condition, dct, incomplete):
            return False
    return True
3423
3424
def match_filter_func(filter_str):
    """Build a callback that applies `filter_str` to an info dict.

    The returned function yields None when the dict passes the filter and
    a human-readable skip message otherwise. Extra positional/keyword
    arguments are handed through to match_str.
    """
    def _match_func(info_dict, *args, **kwargs):
        passed = match_str(filter_str, info_dict, *args, **kwargs)
        if passed:
            return None
        # Prefer the title for the message, then the id, then a generic label
        fallback_name = info_dict.get('id', 'video')
        video_title = info_dict.get('title', fallback_name)
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)

    return _match_func
3433
3434
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float).

    Two forms are recognised:
      * a plain offset in seconds, optionally suffixed with 's' ("12.5s")
      * a clock value "H:MM:SS", whose seconds field may carry a fraction
        introduced by either '.' or ':' ("0:00:01.5", "0:00:01:5")

    Returns None for empty input or anything that matches neither form.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset is not None:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock is None:
        return None

    hours, minutes, seconds = clock.groups()
    # A ':' before the trailing digits is treated like a decimal point
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3446
3447
def srt_subtitles_timecode(seconds):
    """Format a time given in seconds as an SRT timecode ("HH:MM:SS,mmm")."""
    msec = seconds * 1000
    # timetuple_from_msec supplies the four values consumed by the format
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(msec)
3450
3451
def ass_subtitles_timecode(seconds):
    """Format a time given in seconds as an ASS timecode ("H:MM:SS.cc")."""
    parts = timetuple_from_msec(seconds * 1000)
    # ASS uses hundredths of a second; '%02d' drops the fractional part
    centiseconds = parts.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*parts[:-1], centiseconds)
3455
3456
def dfxp2srt(dfxp_data):
    '''
    Convert a DFXP/TTML subtitle document to SRT.

    Every <p> element with a usable begin time and an end time (or
    duration) becomes one SRT cue. The styling properties listed in
    SUPPORTED_STYLING are rendered as <b>/<i>/<u>/<font> markup.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> element
    '''
    # (current namespace, [legacy namespaces]) pairs. The pairs are applied
    # in order, so a legacy '...ttaf1#style' first becomes '.../ns/ttml#style'
    # and is then rewritten to '.../ns/ttml#styling' by the second pair.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # TTML styling attributes that are translated; everything else is ignored
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> {property: value}, parents already merged in
    default_style = {}  # properties inherited from <body>/<div>

    class TTMLPElementParser(object):
        # Parser target that turns one <p> subtree into SRT-flavoured text.
        _out = ''
        # NOTE(review): both lists live on the class and are mutated in place,
        # so they are shared by every parser instance created during one
        # dfxp2srt call. start() pushes onto _applied_styles whenever a style
        # is present, while end() only pops when the element opened a tag, so
        # entries can outlive their paragraph. Kept as-is because it affects
        # which tags later cues emit - confirm before changing.
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                # Effective style: defaults < referenced style < inline attributes
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect with the same value
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        # All font attributes share a single <font> tag
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close in reverse order of opening to keep the markup nested
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialise the element and replay it through the custom target
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style may reference a parent declared later
    # in the document, so passes are repeated until nothing is left pending.
    style_nodes = dfxp.findall(_x('.//ttml:style'))
    while True:
        missing_parents = set()
        resolved_before = len(styles)
        for style in style_nodes:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    missing_parents.add(parent_style_id)
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not missing_parents:
            break
        if len(styles) == resolved_before:
            # No progress: the pending parents are undefined, carry no
            # supported property, or form a cycle. Another identical pass
            # would loop forever, so treat those parents as empty styles;
            # an empty entry behaves like a missing one for every lookup below.
            for parent_style_id in missing_parents:
                styles.setdefault(parent_style_id, {})

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # The cue number follows the paragraph position, so skipped paragraphs
    # leave gaps in the numbering.
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3619
3620
def cli_option(params, command_option, param):
    """Build the argv fragment for an option that takes a value.

    Looks up `param` in `params`. Returns [] when the value is missing or
    None, otherwise [command_option, <value as text>].

    Every non-None value is converted to text, including falsy ones such
    as 0 or False, so the returned list only ever contains strings and is
    safe to pass on as command-line arguments.
    """
    value = params.get(param)
    if value is None:
        return []
    return [command_option, str(value)]
3626
3627
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the argv fragment for a boolean option.

    Returns [] when `param` is missing or None in `params`. Otherwise the
    boolean is rendered as `true_value`/`false_value`, either as a separate
    argument or joined to the option with `separator` when one is given.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if not separator:
        return [command_option, rendered]
    return [command_option + separator + rendered]
3636
3637
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals `expected_value`, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3641
3642
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick extra command-line arguments out of `argdict`.

    `keys` is an ordered list whose entries are either a single key or a
    group of keys. The first entry for which at least one key has a
    non-None value in `argdict` wins, and the argument lists of all its
    keys are concatenated. Keys are lower-cased before lookup.

    A list/tuple `argdict` is returned unchanged when `use_compat` is set
    and is treated like None otherwise. `default` is returned as-is when
    `argdict` is None or nothing matches.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_group in keys:
        candidates = (argdict.get(key.lower()) for key in variadic(key_group))
        found = [args for args in candidates if args is not None]
        if found:
            return list(itertools.chain.from_iterable(found))
    return default
3661
3662
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Look up arguments for `exe` as used by `main_key`.

    The root key is `exe` itself when it equals `main_key` (both compared
    lower-cased) and "<main_key>+<exe>" otherwise; every entry of `keys`
    is appended to it as a suffix. When the bare root key is among the
    candidates, the (main_key, exe) group and 'default' are added as
    fallbacks; otherwise the list/tuple compatibility mode is switched off.
    The actual lookup is delegated to cli_configuration_args.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    same_name = main_key == exe
    root_key = exe if same_name else f'{main_key}+{exe}'

    candidates = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in candidates:
        use_compat = False
    else:
        if not same_name:
            candidates.append((main_key, exe))
        candidates.append('default')
    return cli_configuration_args(argdict, candidates, default, use_compat)
3674
3675
class ISO639Utils(object):
    """Conversions between two-letter (ISO 639-1) and three-letter (ISO 639-2/T) language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Map an ISO 639-1 code to ISO 639-2/T.

        Only the first two characters of `code` are looked up; returns
        None when they are not in the table.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Map an ISO 639-2/T code to ISO 639-1.

        Returns the first table entry (in declaration order) whose long
        code equals `code`, or None when there is none.
        """
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
3879
3880
class ISO3166Utils(object):
    """Lookup of full country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full country name for a two-letter (ISO 3166-1 alpha-2) code.

        The lookup is case-insensitive; unknown codes yield None.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4139
4140
4141 class GeoUtils(object):
4142     # Major IPv4 address blocks per country
4143     _country_ip_map = {
4144         'AD': '46.172.224.0/19',
4145         'AE': '94.200.0.0/13',
4146         'AF': '149.54.0.0/17',
4147         'AG': '209.59.64.0/18',
4148         'AI': '204.14.248.0/21',
4149         'AL': '46.99.0.0/16',
4150         'AM': '46.70.0.0/15',
4151         'AO': '105.168.0.0/13',
4152         'AP': '182.50.184.0/21',
4153         'AQ': '23.154.160.0/24',
4154         'AR': '181.0.0.0/12',
4155         'AS': '202.70.112.0/20',
4156         'AT': '77.116.0.0/14',
4157         'AU': '1.128.0.0/11',
4158         'AW': '181.41.0.0/18',
4159         'AX': '185.217.4.0/22',
4160         'AZ': '5.197.0.0/16',
4161         'BA': '31.176.128.0/17',
4162         'BB': '65.48.128.0/17',
4163         'BD': '114.130.0.0/16',
4164         'BE': '57.0.0.0/8',
4165         'BF': '102.178.0.0/15',
4166         'BG': '95.42.0.0/15',
4167         'BH': '37.131.0.0/17',
4168         'BI': '154.117.192.0/18',
4169         'BJ': '137.255.0.0/16',
4170         'BL': '185.212.72.0/23',
4171         'BM': '196.12.64.0/18',
4172         'BN': '156.31.0.0/16',
4173         'BO': '161.56.0.0/16',
4174         'BQ': '161.0.80.0/20',
4175         'BR': '191.128.0.0/12',
4176         'BS': '24.51.64.0/18',
4177         'BT': '119.2.96.0/19',
4178         'BW': '168.167.0.0/16',
4179         'BY': '178.120.0.0/13',
4180         'BZ': '179.42.192.0/18',
4181         'CA': '99.224.0.0/11',
4182         'CD': '41.243.0.0/16',
4183         'CF': '197.242.176.0/21',
4184         'CG': '160.113.0.0/16',
4185         'CH': '85.0.0.0/13',
4186         'CI': '102.136.0.0/14',
4187         'CK': '202.65.32.0/19',
4188         'CL': '152.172.0.0/14',
4189         'CM': '102.244.0.0/14',
4190         'CN': '36.128.0.0/10',
4191         'CO': '181.240.0.0/12',
4192         'CR': '201.192.0.0/12',
4193         'CU': '152.206.0.0/15',
4194         'CV': '165.90.96.0/19',
4195         'CW': '190.88.128.0/17',
4196         'CY': '31.153.0.0/16',
4197         'CZ': '88.100.0.0/14',
4198         'DE': '53.0.0.0/8',
4199         'DJ': '197.241.0.0/17',
4200         'DK': '87.48.0.0/12',
4201         'DM': '192.243.48.0/20',
4202         'DO': '152.166.0.0/15',
4203         'DZ': '41.96.0.0/12',
4204         'EC': '186.68.0.0/15',
4205         'EE': '90.190.0.0/15',
4206         'EG': '156.160.0.0/11',
4207         'ER': '196.200.96.0/20',
4208         'ES': '88.0.0.0/11',
4209         'ET': '196.188.0.0/14',
4210         'EU': '2.16.0.0/13',
4211         'FI': '91.152.0.0/13',
4212         'FJ': '144.120.0.0/16',
4213         'FK': '80.73.208.0/21',
4214         'FM': '119.252.112.0/20',
4215         'FO': '88.85.32.0/19',
4216         'FR': '90.0.0.0/9',
4217         'GA': '41.158.0.0/15',
4218         'GB': '25.0.0.0/8',
4219         'GD': '74.122.88.0/21',
4220         'GE': '31.146.0.0/16',
4221         'GF': '161.22.64.0/18',
4222         'GG': '62.68.160.0/19',
4223         'GH': '154.160.0.0/12',
4224         'GI': '95.164.0.0/16',
4225         'GL': '88.83.0.0/19',
4226         'GM': '160.182.0.0/15',
4227         'GN': '197.149.192.0/18',
4228         'GP': '104.250.0.0/19',
4229         'GQ': '105.235.224.0/20',
4230         'GR': '94.64.0.0/13',
4231         'GT': '168.234.0.0/16',
4232         'GU': '168.123.0.0/16',
4233         'GW': '197.214.80.0/20',
4234         'GY': '181.41.64.0/18',
4235         'HK': '113.252.0.0/14',
4236         'HN': '181.210.0.0/16',
4237         'HR': '93.136.0.0/13',
4238         'HT': '148.102.128.0/17',
4239         'HU': '84.0.0.0/14',
4240         'ID': '39.192.0.0/10',
4241         'IE': '87.32.0.0/12',
4242         'IL': '79.176.0.0/13',
4243         'IM': '5.62.80.0/20',
4244         'IN': '117.192.0.0/10',
4245         'IO': '203.83.48.0/21',
4246         'IQ': '37.236.0.0/14',
4247         'IR': '2.176.0.0/12',
4248         'IS': '82.221.0.0/16',
4249         'IT': '79.0.0.0/10',
4250         'JE': '87.244.64.0/18',
4251         'JM': '72.27.0.0/17',
4252         'JO': '176.29.0.0/16',
4253         'JP': '133.0.0.0/8',
4254         'KE': '105.48.0.0/12',
4255         'KG': '158.181.128.0/17',
4256         'KH': '36.37.128.0/17',
4257         'KI': '103.25.140.0/22',
4258         'KM': '197.255.224.0/20',
4259         'KN': '198.167.192.0/19',
4260         'KP': '175.45.176.0/22',
4261         'KR': '175.192.0.0/10',
4262         'KW': '37.36.0.0/14',
4263         'KY': '64.96.0.0/15',
4264         'KZ': '2.72.0.0/13',
4265         'LA': '115.84.64.0/18',
4266         'LB': '178.135.0.0/16',
4267         'LC': '24.92.144.0/20',
4268         'LI': '82.117.0.0/19',
4269         'LK': '112.134.0.0/15',
4270         'LR': '102.183.0.0/16',
4271         'LS': '129.232.0.0/17',
4272         'LT': '78.56.0.0/13',
4273         'LU': '188.42.0.0/16',
4274         'LV': '46.109.0.0/16',
4275         'LY': '41.252.0.0/14',
4276         'MA': '105.128.0.0/11',
4277         'MC': '88.209.64.0/18',
4278         'MD': '37.246.0.0/16',
4279         'ME': '178.175.0.0/17',
4280         'MF': '74.112.232.0/21',
4281         'MG': '154.126.0.0/17',
4282         'MH': '117.103.88.0/21',
4283         'MK': '77.28.0.0/15',
4284         'ML': '154.118.128.0/18',
4285         'MM': '37.111.0.0/17',
4286         'MN': '49.0.128.0/17',
4287         'MO': '60.246.0.0/16',
4288         'MP': '202.88.64.0/20',
4289         'MQ': '109.203.224.0/19',
4290         'MR': '41.188.64.0/18',
4291         'MS': '208.90.112.0/22',
4292         'MT': '46.11.0.0/16',
4293         'MU': '105.16.0.0/12',
4294         'MV': '27.114.128.0/18',
4295         'MW': '102.70.0.0/15',
4296         'MX': '187.192.0.0/11',
4297         'MY': '175.136.0.0/13',
4298         'MZ': '197.218.0.0/15',
4299         'NA': '41.182.0.0/16',
4300         'NC': '101.101.0.0/18',
4301         'NE': '197.214.0.0/18',
4302         'NF': '203.17.240.0/22',
4303         'NG': '105.112.0.0/12',
4304         'NI': '186.76.0.0/15',
4305         'NL': '145.96.0.0/11',
4306         'NO': '84.208.0.0/13',
4307         'NP': '36.252.0.0/15',
4308         'NR': '203.98.224.0/19',
4309         'NU': '49.156.48.0/22',
4310         'NZ': '49.224.0.0/14',
4311         'OM': '5.36.0.0/15',
4312         'PA': '186.72.0.0/15',
4313         'PE': '186.160.0.0/14',
4314         'PF': '123.50.64.0/18',
4315         'PG': '124.240.192.0/19',
4316         'PH': '49.144.0.0/13',
4317         'PK': '39.32.0.0/11',
4318         'PL': '83.0.0.0/11',
4319         'PM': '70.36.0.0/20',
4320         'PR': '66.50.0.0/16',
4321         'PS': '188.161.0.0/16',
4322         'PT': '85.240.0.0/13',
4323         'PW': '202.124.224.0/20',
4324         'PY': '181.120.0.0/14',
4325         'QA': '37.210.0.0/15',
4326         'RE': '102.35.0.0/16',
4327         'RO': '79.112.0.0/13',
4328         'RS': '93.86.0.0/15',
4329         'RU': '5.136.0.0/13',
4330         'RW': '41.186.0.0/16',
4331         'SA': '188.48.0.0/13',
4332         'SB': '202.1.160.0/19',
4333         'SC': '154.192.0.0/11',
4334         'SD': '102.120.0.0/13',
4335         'SE': '78.64.0.0/12',
4336         'SG': '8.128.0.0/10',
4337         'SI': '188.196.0.0/14',
4338         'SK': '78.98.0.0/15',
4339         'SL': '102.143.0.0/17',
4340         'SM': '89.186.32.0/19',
4341         'SN': '41.82.0.0/15',
4342         'SO': '154.115.192.0/18',
4343         'SR': '186.179.128.0/17',
4344         'SS': '105.235.208.0/21',
4345         'ST': '197.159.160.0/19',
4346         'SV': '168.243.0.0/16',
4347         'SX': '190.102.0.0/20',
4348         'SY': '5.0.0.0/16',
4349         'SZ': '41.84.224.0/19',
4350         'TC': '65.255.48.0/20',
4351         'TD': '154.68.128.0/19',
4352         'TG': '196.168.0.0/14',
4353         'TH': '171.96.0.0/13',
4354         'TJ': '85.9.128.0/18',
4355         'TK': '27.96.24.0/21',
4356         'TL': '180.189.160.0/20',
4357         'TM': '95.85.96.0/19',
4358         'TN': '197.0.0.0/11',
4359         'TO': '175.176.144.0/21',
4360         'TR': '78.160.0.0/11',
4361         'TT': '186.44.0.0/15',
4362         'TV': '202.2.96.0/19',
4363         'TW': '120.96.0.0/11',
4364         'TZ': '156.156.0.0/14',
4365         'UA': '37.52.0.0/14',
4366         'UG': '102.80.0.0/13',
4367         'US': '6.0.0.0/8',
4368         'UY': '167.56.0.0/13',
4369         'UZ': '84.54.64.0/18',
4370         'VA': '212.77.0.0/19',
4371         'VC': '207.191.240.0/21',
4372         'VE': '186.88.0.0/13',
4373         'VG': '66.81.192.0/20',
4374         'VI': '146.226.0.0/16',
4375         'VN': '14.160.0.0/11',
4376         'VU': '202.80.32.0/20',
4377         'WF': '117.20.32.0/21',
4378         'WS': '202.4.32.0/19',
4379         'YE': '134.35.0.0/16',
4380         'YT': '41.242.116.0/22',
4381         'ZA': '41.0.0.0/11',
4382         'ZM': '102.144.0.0/13',
4383         'ZW': '102.177.192.0/18',
4384     }
4385
4386     @classmethod
4387     def random_ipv4(cls, code_or_block):
4388         if len(code_or_block) == 2:
4389             block = cls._country_ip_map.get(code_or_block.upper())
4390             if not block:
4391                 return None
4392         else:
4393             block = code_or_block
4394         addr, preflen = block.split('/')
4395         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4396         addr_max = addr_min | (0xffffffff >> int(preflen))
4397         return compat_str(socket.inet_ntoa(
4398             compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4399
4400
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden per request

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to use
    (or '__noproxy__' for none). SOCKS proxies are not opened here; they are
    passed on through a 'Ytdl-socks-proxy' header instead.
    """

    def __init__(self, proxies=None):
        # Install http/https openers that default to "no proxy"; the base
        # class initialiser runs afterwards and sets the openers for `proxies`
        for scheme in ('http', 'https'):
            opener = (
                lambda r, proxy='__noproxy__', scheme=scheme, handler=self.proxy_open:
                    handler(r, proxy, scheme))
            setattr(self, '%s_open' % scheme, opener)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A proxy named in the request header wins over the configured one;
        # the header is removed from the request once it has been read
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # The socket is wrapped with socks by yt-dlp's http/https handlers,
            # so only record which proxy they should use
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
4424
4425
4426 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4427 # released into Public Domain
4428 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4429
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    The result has no leading zero bytes, except that zero (and any negative
    value) is returned as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # Minimal big-endian encoding: exactly as many bytes as the value needs
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4458
4459
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes(). An empty byte
    string gives 0.
    """
    return int.from_bytes(s, 'big')
4475
4476
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    RSA encryption as done by OHDave's implementation (http://www.ohdave.com/rsa/)

    The input bytes are reversed before being read as one hexadecimal number,
    which is then raised to `exponent` modulo `modulus`.

    Input:
        data: bytes-like object to encrypt
        exponent, modulus: RSA parameters e and N, both integers
    Output: the encrypted number as a hex string

    Limitation: the whole input is treated as a single block
    '''

    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
4492
4493
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result is laid out as [0, 2] + padding + [0] + data, where the padding
    consists of random non-zero bytes.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data is longer than length - 11
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding must not contain a zero: the first zero after the leading
    # [0, 2] is what marks where the padding ends and the data begins
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + list(data)
4507
4508
def encode_base_n(num, n, table=None):
    """Convert a non-negative integer to its string representation in base n

    @param num    The integer to convert
    @param n      The base
    @param table  The digit characters to use, indexed by digit value.
                  Defaults to the first n characters of 0-9, a-z, A-Z
    @returns      The representation of num, most significant digit first
    @raises ValueError  If n exceeds the table length, or if num is negative
                        or n is smaller than 2 for a non-zero num
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # The digit loop below only terminates for a positive number and a base
    # of at least 2, so reject everything else up front
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small' % n)

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # Digits were produced least significant first
    return ''.join(reversed(digits))
4525
4526
def decode_packed_codes(code):
    """Expand code matched by PACKED_CODES_RE back into its unpacked form

    The match supplies the obfuscated code, a base, a count and a '|'-separated
    symbol list. Every word of the obfuscated code is replaced by the symbol
    whose index, written in that base, equals the word; an empty symbol stands
    for the word itself.
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    radix = int(base)
    remaining = int(count)
    words = symbols.split('|')

    lookup = {}
    while remaining:
        remaining -= 1
        encoded = encode_base_n(remaining, radix)
        lookup[encoded] = words[remaining] or encoded

    def _substitute(word_match):
        return lookup[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _substitute, obfuscated_code)
4543
4544
def caesar(s, alphabet, shift):
    """Shift every character of s that occurs in alphabet by `shift` positions

    Positions wrap around the end of alphabet; characters that are not part
    of alphabet are kept as they are. A shift of 0 returns s itself.
    """
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = []
    for ch in s:
        if ch in alphabet:
            ch = alphabet[(alphabet.index(ch) + shift) % size]
        shifted.append(ch)
    return ''.join(shifted)
4552
4553
def rot47(s):
    """Apply caesar() with a shift of 47 over the 94 characters from '!' to '~'"""
    printable = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable, 47)
4556
4557
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list (KEY=value,KEY="value",...) into a dict

    Surrounding double quotes are removed from quoted values. When a key is
    repeated, the last value wins.
    """
    parsed = {}
    for match in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        value = match.group('val')
        parsed[match.group('key')] = value[1:-1] if value.startswith('"') else value
    return parsed
4565
4566
def urshift(val, n):
    """Right-shift val by n bits, first adding 2**32 to a negative val"""
    if val < 0:
        val += 0x100000000
    return val >> n
4569
4570
4571 # Based on png2str() written by @gdkchan and improved by @yokrysty
4572 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG image into rows of raw sample bytes

    The 8-byte signature and the leading IHDR chunk are checked, the data of
    all IDAT chunks is concatenated and decompressed, and the per-scanline
    filters are undone. Only width and height are read from IHDR; every
    scanline is treated as `width * 3` bytes, so the image is presumably
    expected to be 8-bit RGB without interlacing (NOTE(review): confirm
    against the callers). Chunk CRCs are skipped, not verified.

    @param png_data  The whole PNG file as a byte string
    @returns         (width, height, pixels) where pixels is a list of `height`
                     rows, each a flat list of `width * 3` integers
    @raises IOError  If the signature/IHDR check fails or there is no IDAT data
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # After the signature, the first chunk (4 bytes length, then type) must be IHDR
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned integer formats keyed by the byte length of the field
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Split the rest of the file into chunks: length, type, data, CRC
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # The first chunk was verified to be IHDR above
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # The compressed image data may be spread over several IDAT chunks
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Number of data bytes per scanline (3 bytes per pixel)
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # Look up an already reconstructed byte by its flat index
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is preceded by one byte holding its filter type
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        # Appended before it is filled so that _get_pixel can read the bytes
        # of the current row that have been reconstructed so far
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbours outside the image count as 0
            if x > 2:
                left = _get_pixel(basex - 3)  # Same byte of the previous pixel
            if y > 0:
                up = _get_pixel(basex - stride)  # Same byte of the row above

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                # c is the byte up and to the left
                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Use whichever of a, b, c is closest to p; ties prefer a, then b
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            # Any other filter type (including 0) leaves the byte unchanged
            current_row.append(color)

    return width, height, pixels
4676
4677
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file at `path` to `value`

    The Python `xattr` module is used when it can be imported (both the
    pyxattr and the xattr flavour are handled). Otherwise, NTFS Alternate Data
    Streams are used when compat_os_name is 'nt', and the `setfattr` or `xattr`
    command line tools elsewhere.

    @param path   Path of the file to modify
    @param key    Name of the attribute
    @param value  Attribute value as a byte string (it is written in binary
                  mode, or decoded as UTF-8 for the command line tools)
    @raises XAttrMetadataError     If setting the attribute fails
    @raises XAttrUnavailableError  If no usable implementation is found, or
                                   the detected pyxattr is too old
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        # Two different modules are importable as `xattr`; only pyxattr has `set`
        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                # NOTE: despite the wording of the message, this exception is
                # not caught below, so no other implementation is tried
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        # No Python xattr module is available
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            # A stream is addressed as "<file path>:<stream name>"
            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The value is passed to the tool as a command line argument
                value = value.decode('utf-8')
                # setfattr is preferred when both tools are available
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate_or_kill()
                stderr = stderr.decode('utf-8', 'replace')
                # A non-zero exit status is reported together with the tool's stderr
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
4760
4761
def random_birthday(year_field, month_field, day_field):
    """Generate a random date between 1950-01-01 and 1995-12-31 (inclusive)

    @returns  A dict mapping the three given field names to the year, month
              and day of the date, each as a string without zero padding
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    parts = (birthday.year, birthday.month, birthday.day)
    return {
        field: str(part)
        for field, part in zip((year_field, month_field, day_field), parts)}
4772
4773
# Templates for internet shortcut files, which are plain text files.
# Each template is a %-format string taking a mapping; the .lstrip() removes
# the newline that follows the opening quotes.

# "[InternetShortcut]" format; expects the key "url"
DOT_URL_LINK_TEMPLATE = '''
[InternetShortcut]
URL=%(url)s
'''.lstrip()

# Apple property list (XML) format; expects the key "url"
DOT_WEBLOC_LINK_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''.lstrip()

# "[Desktop Entry]" format of type Link; expects the keys "filename" and "url"
DOT_DESKTOP_LINK_TEMPLATE = '''
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''.lstrip()

# Maps a link type name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4805
4806
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    An explicit port is kept, except for port 80 on an `http` IRI, where it is the scheme's default. An IRI without a hostname (e.g. an empty or relative one) is converted without a host part.

    Raises a ValueError for IRIs with a bracketed (IPv6) host.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no network location at all
    if iri_parts.hostname:
        net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
        # The 'idna' encoding produces ASCII text.

    # Port 80 is only redundant for `http`; for any other scheme, dropping it would make the URI point to a different port.
    is_default_port = iri_parts.scheme == 'http' and iri_parts.port == 80
    if iri_parts.port is not None and not is_default_port:
        net_location += ':' + str(iri_parts.port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4849
4850
def to_high_limit_path(path):
    """Return path in a form that is not subject to MAX_PATH on Windows

    When sys.platform is 'win32' or 'cygwin', the absolute path prefixed with
    \\\\?\\ is returned; on any other platform, path is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    long_path_prefix = r'\\?\ '.rstrip()
    return long_path_prefix + os.path.abspath(path)
4857
4858
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format a value with template, falling back to default for ignored values

    @param obj       The value itself (when field is None) or an object whose
                     get(field, default) provides the value
    @param field     Key to look up in obj, or None to use obj directly
    @param template  %-format string applied to the value
    @param ignore    Values for which default is returned instead
    @param default   Fallback for a missing or ignored value
    @param func      If given, applied to a non-ignored value before formatting;
                     its result is checked against ignore as well
    """
    if field is not None:
        val = obj.get(field, default)
    elif obj is None:
        val = default
    else:
        val = obj

    if val in ignore:
        return default
    if func:
        val = func(val)
        if val in ignore:
            return default
    return template % val
4867
4868
def clean_podcast_url(url):
    """Remove the redirect prefixes of known podcast tracking services from url"""
    tracking_prefix = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix, '', url)
4884
4885
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random version 4 UUID in its lowercase textual form

    In the template, "4" is the fixed version digit, every "x" becomes a random
    hex digit and "y" becomes one of 8, 9, a or b, so that the variant bits of
    the UUID are set to "10".
    """
    def _random_digit(mobj):
        if mobj.group(0) == 'y':
            return '89ab'[random.randint(0, 3)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4891
4892
def make_dir(path, to_screen=None):
    """Create the parent directory of path if it does not exist yet

    @param path       Path of a file; only its directory part is created
    @param to_screen  Optional callable that is given an error message when
                      the directory cannot be created
    @returns          True on success (also when there is nothing to create),
                      False if creating the directory failed
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            # exist_ok: the directory may get created by someone else between
            # the check above and this call
            os.makedirs(dn, exist_ok=True)
        return True
    except (OSError, IOError) as err:
        # Only report when a reporter was actually passed
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
4903
4904
def get_executable_path():
    """Return the absolute path of the directory the program is run from

    This is the directory of sys.executable when `sys.frozen` is set, and
    otherwise a directory relative to this file: two levels up when the module
    was loaded by a zipimporter, one level up in any other case.
    """
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        return os.path.abspath(os.path.dirname(sys.executable))

    module_dir = os.path.dirname(__file__)
    if isinstance(globals().get('__loader__'), zipimporter):  # Running from ZIP
        relative = '../..'
    else:
        relative = '..'
    return os.path.abspath(os.path.join(module_dir, relative))
4914
4915
def load_plugins(name, suffix, namespace):
    """Load plugin classes from ytdlp_plugins/<name>/__init__.py

    The file is looked up below get_executable_path() and executed as a module
    called `name`. Every attribute of that module whose name ends with `suffix`
    and is not yet present in `namespace` is added to `namespace`.

    @param name       Name of the plugin package (and of the loaded module)
    @param suffix     Required ending of the attribute names to pick up
    @param namespace  Dict that is updated in place with the new attributes
    @returns          Dict of the attributes that were added; empty when a
                      FileNotFoundError occurs while loading
    """
    classes = {}
    try:
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        # Registered before execution, as the module may import itself
        sys.modules[plugins_spec.name] = plugins
        try:
            plugins_spec.loader.exec_module(plugins)
        except FileNotFoundError:
            # Don't leave a module that was never executed in sys.modules
            if sys.modules.get(plugins_spec.name) is plugins:
                del sys.modules[plugins_spec.name]
            raise
        for attr_name in dir(plugins):
            if attr_name in namespace:
                continue
            if not attr_name.endswith(suffix):
                continue
            klass = getattr(plugins, attr_name)
            classes[attr_name] = namespace[attr_name] = klass
    except FileNotFoundError:
        pass
    return classes
4934
4935
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            a function, a tuple of strings/None or "...".
                            When a function is given, it takes the key as argument and
                            returns whether the key matches or not. When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
                            "None" returns the object without traversal
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    @returns                The value found by the first path that yields a result
                            (a list of values when the path branches and get_all
                            is True), else default
    # TODO: Write tests
    '''
    if not casesense:
        # Lower-case the string keys of every path; dictionary keys are
        # lower-cased at lookup time (see below)
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` (set by the loop at the bottom) records the deepest level of
        # branching seen, i.e. how many levels of lists the result is nested in
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            # A None key stops the traversal; a None object cannot be traversed
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Traverse each alternative, then branch over the results
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                # Branch: apply the rest of the path to every value
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                # Branch: apply the rest of the path to every value whose
                # key/index is accepted by the function
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    # NOTE(review): iterating a str yields single characters,
                    # which cannot be unpacked into (k, v) below - confirm
                    # whether enumerate(str(obj)) was intended
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # Plain dictionary lookup; when case-insensitive, an exact
                # match is preferred over a lower-cased one
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                # Index or slice into a sequence
                if is_user_input:
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    # A bare ":" selects everything, which is the same as "..."
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    # Build the filter for final values: an isinstance check for a type, the
    # callable itself otherwise, or the identity function
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # The path branched: flatten the nested lists into one level,
                # dropping None entries, then filter the individual values
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
5033
5034
5035 # Deprecated
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated: writes a deprecation notice and delegates to traverse_obj

    The keys are treated as user input and strings are traversed as well.
    """
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
5040
5041
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return x itself if it is an iterable, else wrap it in a 1-tuple

    Instances of allowed_types are wrapped as well, even though they are
    iterable.
    """
    if not isinstance(x, collections.abc.Iterable) or isinstance(x, allowed_types):
        return (x,)
    return x
5044
5045
5046 # create a JSON Web Signature (jws) with HS256 algorithm
5047 # the resulting format is in JWS Compact Serialization
5048 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5049 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a token signed with HMAC-SHA256 in the form header.payload.signature

    @param payload_data  JSON-serializable payload
    @param key           The signing key as a string (encoded as UTF-8)
    @param headers       Optional dict of header fields that are added to, or
                         override, the default {'alg': 'HS256', 'typ': 'JWT'}
    @returns             The token as a byte string

    The three segments are encoded with standard, padded Base64.
    NOTE(review): RFC 7515 specifies unpadded base64url for the segments; the
    encoding is left as it is since callers may depend on it - confirm.
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
    # The signature covers the encoded header and payload, joined by a dot
    signing_input = header_b64 + b'.' + payload_b64
    h = hmac.new(key.encode('utf-8'), signing_input, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return signing_input + b'.' + signature_b64
5063
5064
5065 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of a token of the form header.payload.signature

    Neither the header nor the signature is inspected.

    @param jwt         The token as a string
    @returns           The payload parsed from JSON
    @raises ValueError If the token does not consist of exactly three segments
    """
    _, payload_b64, _ = jwt.split('.')
    # The padding is normally stripped from the segments of a token, but the
    # decoder needs the length to be a multiple of 4
    payload_b64 += '=' * (-len(payload_b64) % 4)
    return json.loads(base64.urlsafe_b64decode(payload_b64))
5070
5071
def supports_terminal_sequences(stream):
    """Tell whether terminal escape sequences can be written to stream

    This requires WINDOWS_VT_MODE and at least Windows version 10.0.10586 when
    compat_os_name is 'nt', and a non-empty TERM environment variable
    elsewhere. In addition, stream.isatty() must report a terminal; any error
    raised by it counts as "not supported".
    """
    if compat_os_name != 'nt':
        if not os.getenv('TERM'):
            return False
    else:
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
            return False

    try:
        is_terminal = stream.isatty()
    except BaseException:
        return False
    return is_terminal
5083
5084
# Matches ESC, "[", one or more characters other than "m", and a closing "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return string with every match of _terminal_sequences_re removed"""
    return re.sub(_terminal_sequences_re, '', string)
5090
5091
def number_of_digits(number):
    """Return the length of number when formatted with '%d'

    A minus sign is counted as well.
    """
    rendered = '%d' % number
    return len(rendered)
5094
5095
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy values with delim

    @param values     The values to join; falsy ones are skipped
    @param delim      The separator
    @param from_dict  If given, values are keys that are looked up in this
                      dict (using .get) before joining
    """
    if from_dict is not None:
        values = (from_dict.get(key) for key in values)
    return delim.join(str(value) for value in values if value)