]> jfr.im git - yt-dlp.git/blob - yt_dlp/utils/_utils.py
[ie/soundcloud] Adjust format sorting (#9584)
[yt-dlp.git] / yt_dlp / utils / _utils.py
1 import base64
2 import binascii
3 import calendar
4 import codecs
5 import collections
6 import collections.abc
7 import contextlib
8 import datetime
9 import email.header
10 import email.utils
11 import errno
12 import hashlib
13 import hmac
14 import html.entities
15 import html.parser
16 import inspect
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import mimetypes
23 import netrc
24 import operator
25 import os
26 import platform
27 import random
28 import re
29 import shlex
30 import socket
31 import ssl
32 import struct
33 import subprocess
34 import sys
35 import tempfile
36 import time
37 import traceback
38 import types
39 import unicodedata
40 import urllib.error
41 import urllib.parse
42 import urllib.request
43 import xml.etree.ElementTree
44
45 from . import traversal
46
47 from ..compat import functools # isort: split
48 from ..compat import (
49 compat_etree_fromstring,
50 compat_expanduser,
51 compat_HTMLParseError,
52 compat_os_name,
53 compat_shlex_quote,
54 )
55 from ..dependencies import xattr
56
# Make this submodule masquerade as its parent package (`yt_dlp.utils`) so
# that names defined here appear to come from the parent when introspected
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# Runtime type of a compiled regular expression (re.Pattern);
# this is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
61
62
class NO_DEFAULT:
    """Sentinel used to distinguish "argument not supplied" from `None`."""
    pass
65
66
def IDENTITY(x):
    """Identity function: return the argument unchanged."""
    return x
69
70
# Month-name tables used by the date parsing helpers
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# Timezone abbreviation -> UTC offset in hours
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# Accented character -> ASCII replacement table;
# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

# strptime formats tried (in order) when parsing dates
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Variants for locales where ambiguous numeric dates are day-first
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# Variants for locales where ambiguous numeric dates are month-first
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of P.A.C.K.E.R.-packed JavaScript
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches <script type="application/ld+json"> blocks (JSON-LD metadata)
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Unsigned decimal number with optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
172
173
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable for encoding text
        'TEST'.encode(encoding)
        return encoding
    except Exception:
        # Locale lookup failed or produced an unusable codec
        return 'UTF-8'
188
189
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # Write to a temp file in the same directory first, then rename over the
    # destination, so readers never observe a partially-written file
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile creates the file with restrictive permissions;
            # widen them to the usual umask-derived mode for a regular file
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file; re-raise the original error
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
214
215
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    # Only simple attribute names are supported
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
221
222 # On python2.6 the xml.etree.ElementTree.Element methods don't support
223 # the namespace parameter
224
225
def xpath_with_ns(path, ns_map):
    """Expand `prefix:tag` components of *path* to `{uri}tag` using `ns_map`."""
    def expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            # No namespace prefix on this component
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(component) for component in path.split('/'))
236
237
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element matching `xpath` (a string, or an iterable of
    candidate xpaths tried in order).

    @param default  value to return when nothing matches
    @param fatal    raise ExtractorError instead of returning None on no match
    """
    candidates = [xpath] if isinstance(xpath, str) else xpath
    for xp in candidates:
        n = node.find(xp)
        if n is not None:
            break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
    return None
259
260
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        # Not found (or the default itself was returned): pass it through
        return n
    if n.text is not None:
        return n.text
    # Element exists but has no text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
274
275
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the first element matching xpath[@key]."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = f'{xpath}[@{key}]' if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
287
288
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)
292
293
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
297
298
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    return matches[0] if matches else None
303
304
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_html_by_class(class_name, html)
    return matches[0] if matches else None
309
310
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the given attribute=value pair."""
    matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    return matches[0] if matches else None
314
315
def get_element_html_by_attribute(attribute, value, html, **kwargs):
    """Return the html of the first tag with the given attribute=value pair."""
    matches = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return matches[0] if matches else None
319
320
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match the class name as a whitespace/quote-delimited token in the attribute
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
326
327
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # Match the class name as a whitespace/quote-delimited token in the attribute
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
333
334
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [text for text, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
338
339
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [element for _, element in get_elements_text_and_html_by_attribute(*args, **kwargs)]
343
344
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    # If the value contains characters that cannot legally appear unquoted in
    # an HTML attribute value, quotes are mandatory; otherwise optional
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Match the opening tag up to (and including) the target attribute=value;
    # (?-x:...) disables verbose mode so `value` is taken literally
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Delegate extracting the full element to the tag-based parser
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip a fully-quoted content and decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
370
371
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Control-flow exception: signals that the outermost tag has closed
        pass

    def __init__(self):
        # Stack of currently-open tag names
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until the matching opening tag; implicitly closes unclosed inner tags
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The first-opened tag has now been closed
            raise self.HTMLBreakOnClosingTagException()
412
413
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index, but raising a custom exception instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed the opening tag first so the parser's stack starts with `tag`
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            # Advance candidate-by-candidate: each literal closing tag may
            # belong to a nested element, in which case the parser won't raise
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                # Outermost tag closed here: split out content and whole element
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
448
449
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the first start tag encountered
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        # Abort parsing immediately; the caller suppresses this exception
        raise compat_HTMLParseError('done')
460
461
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        self.items = []  # attribute dicts of top-level <li> elements
        self._level = 0  # current tag nesting depth

    def handle_starttag(self, tag, attrs):
        # Only collect attributes of <li> elements at the top level
        if self._level == 0 and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
477
478
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    # The parser raises compat_HTMLParseError after the first start tag;
    # suppress it — the attributes are already collected at that point
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
498
499
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of their attribute dictionaries"""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
507
508
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    if html is None:  # Convenience for sanitizing descriptions etc.
        return None

    # Collapse whitespace, then turn <br> and paragraph breaks into newlines
    cleaned = re.sub(r'\s+', ' ', html)
    cleaned = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', cleaned)
    cleaned = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', cleaned)
    # Strip any remaining tags, then decode HTML entities
    cleaned = re.sub('<.*?>', '', cleaned)
    return unescapeHTML(cleaned).strip()
523
524
class LenientJSONDecoder(json.JSONDecoder):
    """JSONDecoder that can pre-transform the input, ignore trailing garbage,
    and attempt to close a limited number of unterminated objects/arrays."""
    # TODO: Write tests

    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        # transform_source: callable applied to the raw string before decoding
        # ignore_extra: decode only the first JSON value, ignoring trailing data
        # close_objects: max number of unterminated containers to try closing
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # Each container may need two repair attempts (comma, then bracket)
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        # Attempt to repair the truncated document based on the decoder's
        # error message; returns the repaired string or None if inapplicable
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                # Re-raise with a snippet of the offending context for debugging
                raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
563
564
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        # '-' means standard output
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking unsupported or failed: fall back to a plain open()
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # EACCES is a permission problem; a different name won't help
            if attempt or err.errno in (errno.EACCES,):
                raise
            # First failure: retry once with a sanitized path
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
602
603
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a recognizable RFC 2822 date
        return None
    return email.utils.mktime_tz(parsed)
611
612
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # NUL ('\0') is used below as a marker for substituted characters,
        # so that runs of substitutes can be collapsed/stripped afterwards
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            # Drop control/combining marks; substitute everything else
            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        # Collapse runs of underscores and trim leading/trailing junk
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
666
667
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    # XXX: this handles drive relative paths (c:sth) incorrectly
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        # Non-Windows platforms need no sanitization unless forced
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters invalid in Windows path components (and trailing
    # whitespace/dots) with '#', keeping '.'/'..' components intact
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Preserve absoluteness of the path when forced sanitization is used
        sanitized_path.insert(0, os.path.sep)
    # TODO: Fix behavioral differences <3.12
    # The workaround using `normpath` only superficially passes tests
    # Ref: https://github.com/python/cpython/pull/100351
    return os.path.normpath(os.path.join(*sanitized_path))
693
694
def sanitize_url(url, *, scheme='http'):
    """Prepend a scheme to protocol-relative URLs and fix common scheme typos.

    None is passed through unchanged.
    """
    if url is None:
        return None
    if url.startswith('//'):
        # Prepend protocol-less URLs with `scheme` in order to mitigate
        # the number of unwanted failures due to missing protocol
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    for mistake, fixup in (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    ):
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
713
714
def extract_basic_auth(url):
    """Strip userinfo from `url`; return (clean_url, basic_auth_header_or_None)."""
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild the netloc without the user:password@ prefix
    netloc = parts.hostname if parts.port is None else '%s:%d' % (parts.hostname, parts.port)
    stripped_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = '%s:%s' % (parts.username, parts.password or '')
    auth_payload = base64.b64encode(credentials.encode())
    return stripped_url, f'Basic {auth_payload.decode()}'
725
726
def expand_path(s):
    """Expand shell variables and ~"""
    # NOTE(review): compat_expanduser is used instead of os.path.expanduser,
    # presumably for platform-specific quirks — see ..compat
    return os.path.expandvars(compat_expanduser(s))
730
731
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _uniq():
        seen = []  # a list, not a set: items may be unhashable
        for item in iterable:
            if item not in seen:
                seen.append(item)
                yield item

    generator = _uniq()
    return generator if lazy else list(generator)
742
743
744 def _htmlentity_transform(entity_with_semicolon):
745 """Transforms an HTML entity to a character."""
746 entity = entity_with_semicolon[:-1]
747
748 # Known non-numeric HTML entity
749 if entity in html.entities.name2codepoint:
750 return chr(html.entities.name2codepoint[entity])
751
752 # TODO: HTML5 allows entities without a semicolon.
753 # E.g. '&Eacuteric' should be decoded as 'Éric'.
754 if entity_with_semicolon in html.entities.html5:
755 return html.entities.html5[entity_with_semicolon]
756
757 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
758 if mobj is not None:
759 numstr = mobj.group(1)
760 if numstr.startswith('x'):
761 base = 16
762 numstr = '0%s' % numstr
763 else:
764 base = 10
765 # See https://github.com/ytdl-org/youtube-dl/issues/7518
766 with contextlib.suppress(ValueError):
767 return chr(int(numstr, base))
768
769 # Unknown entity in name, return its literal representation
770 return '&%s;' % entity
771
772
def unescapeHTML(s):
    """Replace HTML entities in `s` with their characters; None passes through."""
    if s is None:
        return None
    assert isinstance(s, str)

    # Each entity runs from '&' up to (and including) the next ';'
    return re.sub(r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
780
781
def escapeHTML(text):
    """Escape the five HTML-special characters in `text`."""
    # '&' must be replaced first so later escapes are not double-escaped
    for char, escaped in (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    ):
        text = text.replace(char, escaped)
    return text
791
792
class netrc_from_content(netrc.netrc):
    """A netrc.netrc that parses credentials from a string instead of a file."""

    def __init__(self, content):
        # Deliberately skip netrc.netrc.__init__ (which reads a file on disk)
        # and parse the in-memory content directly
        self.hosts = {}
        self.macros = {}
        with io.StringIO(content) as stream:
            self._parse('-', stream, False)
798
799
class Popen(subprocess.Popen):
    """subprocess.Popen wrapper: hides the console window on Windows, restores
    PyInstaller-modified library paths, defaults text mode to UTF-8 and adds
    convenience helpers (communicate_or_kill, run)."""

    if sys.platform == 'win32':
        # Prevent a console window from flashing up on Windows
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
             https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            # Not running from a PyInstaller bundle
            return

        def _fix(key):
            # PyInstaller saves the pre-bundle value under <key>_ORIG
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether output will be text so run() can pick its default
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            # Spawn cmd.exe explicitly with quoting-safe flags rather than
            # relying on shell=True
            if not isinstance(args, str):
                args = ' '.join(compat_shlex_quote(a) for a in args)
            shell = False
            args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        # Resolve the cmd.exe path from the environment
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        """communicate(), killing the process if anything goes wrong."""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        # timeout=None waits indefinitely; timeout=0 (default) does not wait
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run the process to completion; return (stdout, stderr, returncode)."""
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
870
871
def encodeArgument(s):
    """Return `s` as str, decoding ASCII byte strings for legacy callers."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
877
878
# Named duration tuple returned by timetuple_from_msec()
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into (hours, minutes, seconds, milliseconds)."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
887
888
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H<delim>]MM<delim>SS[.mmm]."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        formatted = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        formatted = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        formatted = '%d' % t.seconds
    return '%s.%03d' % (formatted, t.milliseconds) if msec else formatted
898
899
def bug_reports_message(before=';'):
    """Build the standard "please report this issue" blurb, joined after `before`."""
    from ..update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        # Starting a new sentence: capitalize the first word
        msg = msg[0].title() + msg[1:]

    if not before:
        return msg
    return before + ' ' + msg
911
912
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Subclasses may provide a class-level default message
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            # Neither an explicit nor a class-level message: use the class name
            self.msg = type(self).__name__
        super().__init__(self.msg)
923
924
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions
        # Network failures are expected conditions, not bugs
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # Collapse chains of ExtractorErrors down to the innermost exc_info
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: "[ie] video_id: msg (caused by ...)" plus the
        # bug-report blurb for unexpected errors
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Join the stored traceback with the cause's traceback, if any."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Keep msg/args in sync whenever a contributing attribute changes
        # after initialization (msg is truthy only once __init__ has run)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
967
968
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL."""

    def __init__(self, url):
        super().__init__(f'Unsupported URL: {url}', expected=True)
        self.url = url
974
975
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
979
980
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Geo-restriction is an expected condition, never a bug
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries
992
993
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        # A falsy message falls back to the generic one
        message = msg or 'The channel is not currently live'
        kwargs['expected'] = True
        super().__init__(message, **kwargs)
1000
1001
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info())."""
        self.exc_info = exc_info
        super().__init__(msg)
1014
1015
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'
1023
1024
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        # Append the colliding filename so the user can see which output
        # path was reused (previously a literal placeholder was emitted)
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)
1037
1038
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
1045
1046
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'
1050
1051
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1055
1056
class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1060
1061
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1065
1066
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        # Whether this condition is anticipated (analogous to ExtractorError.expected)
        self.expected = expected
1073
1074
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always reported with expected=False
        super().__init__(self.msg, expected=False)
1081
1082
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        # Append the underlying error to the instance message, if any
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1095
1096
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
1110
1111
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing xattr metadata fails; classifies a coarse `reason`."""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code, self.msg = code, msg

        # Derive a machine-readable reason from the errno and/or message text
        if self.code in (errno.ENOSPC, errno.EDQUOT) or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg:
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1126
1127
class XAttrUnavailableError(YoutubeDLError):
    # Raised when no usable xattr implementation is available on this system
    pass
1130
1131
def is_path_like(f):
    """Return True if `f` can be used as a filesystem path (str/bytes/PathLike)."""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1134
1135
def extract_timezone(date_str):
    # Split a trailing timezone specifier off `date_str`.
    # Returns (timezone, date_str): a datetime.timedelta UTC offset and the
    # date string with the timezone portion removed.
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
            (?P<sign>\+|-)                                       # +/-
            (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})           # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # No numeric offset: try a trailing timezone *name* (e.g. 'EST')
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        # Unknown names fall back to UTC (timezone stays None -> hours=0)
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # Plain 'Z' suffix: UTC
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1164
1165
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # Fractional seconds are not representable by strptime's fixed format
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
    except ValueError:
        return None
    return calendar.timegm(dt.timetuple())
1181
1182
def date_formats(day_first=True):
    """Select the strptime format list for day-first vs month-first dates."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1185
1186
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    # NB: no break — the last format that successfully parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822 style parsing
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)
1209
1210
def unified_timestamp(date_str, day_first=True):
    """Convert a free-form date/time string into a UNIX timestamp, or None."""
    if not isinstance(date_str, str):
        return None

    # Drop commas/pipes and weekday names, then collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # Note: detected before AM/PM removal below
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to RFC 2822 style parsing
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1242
1243
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL; `default_ext` when none is found."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1255
1256
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    # Build a subtitle filename like 'video.en.srt' from 'video.<ext>'
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1259
1260
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Relative offset: <base><sign><amount><unit>, applied recursively to <base>
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # Month/year offsets need calendar-aware arithmetic
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # Round to the unit the user actually specified
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1301
1302
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1313
1314
def datetime_add_months(dt, months):
    """Shift `dt` by `months` (may be negative), clamping the day-of-month
    to the length of the target month (e.g. Jan 31 + 1 month -> Feb 28/29)."""
    total_months = dt.month - 1 + months
    target_year = dt.year + total_months // 12
    target_month = total_months % 12 + 1
    target_day = min(dt.day, calendar.monthrange(target_year, target_month)[1])
    return dt.replace(target_year, target_month, target_day)
1322
1323
def datetime_round(dt, precision='day'):
    """Round `dt` to the nearest `precision` unit.

    Returns an aware UTC datetime, except for 'microsecond' which returns
    `dt` unchanged.
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    stamp = calendar.timegm(dt.timetuple())
    rounded = ((stamp + seconds_per_unit / 2) // seconds_per_unit) * seconds_per_unit
    return datetime.datetime.fromtimestamp(rounded, datetime.timezone.utc)
1340
1341
def hyphenate_date(date_str):
    """Convert 'YYYYMMDD' to 'YYYY-MM-DD'; any other input is returned unchanged."""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(m.groups()) if m else date_str
1350
1351
class DateRange:
    """Inclusive interval between two dates."""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = date_from_str(start, strict=True) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end, strict=True) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        needle = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= needle <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __str__(self):
        return '{} to {}'.format(self.start, self.end)

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
1388
1389
@functools.cache
def system_identifier():
    """Return a one-line description of the Python/OS environment (for bug reports)."""
    python_implementation = platform.python_implementation()
    if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        python_implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1408
1409
@functools.cache
def get_windows_version():
    """Get the Windows version as a tuple; returns () when not on Windows."""
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1417
1418
def write_string(s, out=None, encoding=None):
    """Write string `s` to `out` (default: sys.stderr), handling byte streams
    and platform quirks, then flush."""
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # Prepend a space before newlines to work around console rendering quirks
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
    if 'b' in (getattr(out, 'mode', None) or ''):
        # Binary stream: we must encode ourselves
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: write bytes to it
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1439
1440
# TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Emit a deprecation message: printed (once per message) when running as
    the CLI, or raised as a DeprecationWarning when used as a library."""
    from .. import _IN_CLI
    if _IN_CLI:
        # De-duplicate: each distinct message is shown at most once
        if msg in deprecation_warning._cache:
            return
        deprecation_warning._cache.add(msg)
        if printer:
            return printer(f'{msg}{bug_reports_message()}', **kwargs)
        return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
    else:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)


deprecation_warning._cache = set()
1457
1458
def bytes_to_intlist(bs):
    """Return the byte values of `bs` as a list of ints (str input: code points)."""
    if not bs:
        return []
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
1466
1467
def intlist_to_bytes(xs):
    """Pack a sequence of ints (0-255) into a bytes object."""
    if not xs:
        return b''
    return struct.pack(f'{len(xs)}B', *xs)
1472
1473
class LockingUnsupportedError(OSError):
    # Raised by the no-op lock implementations when no locking backend exists
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1479
1480
# Cross-platform file locking
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) for the current
# platform: Win32 LockFileEx/UnlockFileEx via ctypes, POSIX fcntl, or stubs
# that raise LockingUnsupportedError when neither is available.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file (low/high dwords of the byte count)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive for the matching UnlockFileEx call
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # 0x2 = exclusive lock, 0x1 = fail immediately instead of blocking
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            # Try each unlock mechanism in turn; the first that succeeds wins
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
1567
1568
class locked_file:
    """File wrapper that opens with a cross-platform advisory lock.

    Shared lock for read-only modes, exclusive otherwise. Truncation for 'w'
    modes is deferred until after the lock is acquired. Usable as a context
    manager; unhandled attributes delegate to the underlying file object.
    """
    # Whether the lock is currently held
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        # Only read-only modes take a shared lock
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only after the lock is held, so readers never see a
            # half-truncated file
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the object can also be used without a `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1632
1633
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' if it is None."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
1638
1639
def shell_quote(args):
    """Quote each argument for safe use on a shell command line and join with spaces."""
    fs_encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(a.decode(fs_encoding) if isinstance(a, bytes) else a)
        for a in args)
1649
1650
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, existing_data = unsmuggle_url(url, {})
    data.update(existing_data)
    payload = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{payload}'
1659
1660
def unsmuggle_url(smug_url, default=None):
    """Extract data smuggled into a URL fragment; returns (url, data or `default`)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1668
1669
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format a number with a decimal suffix (k, M, G, ...); binary (Ki, Mi, ...)
    when factor=1024. Returns None for missing or negative input."""
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    suffixes = 'kMGTPEZY'
    exponent = min(int(math.log(num, factor)), len(suffixes)) if num else 0
    suffix = ('', *suffixes)[exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    return fmt % (num / factor ** exponent, suffix)
1682
1683
def format_bytes(bytes):
    # NB: parameter shadows the builtin `bytes`; kept for API compatibility
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
1686
1687
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number> <unit>' using the multipliers in `unit_table`.

    Returns the rounded product, or None when `s` does not match.
    Non-strict mode accepts ',' as a decimal separator and only anchors
    at the start of the string.
    """
    num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = (re.fullmatch if strict else re.match)(
        rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None

    num = float(m.group('num').replace(',', '.'))
    mult = unit_table[m.group('unit')]
    return round(num * mult)
1699
1700
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    # Units are binary powers: K=1024, M=1024**2, ... (case-insensitive input)
    return lookup_unit_table(
        {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
        s.upper(), strict=True)
1706
1707
def parse_filesize(s):
    """Parse a human-readable file size ('5.4 MiB', '300kb', ...) into bytes, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # NB: the casing quirks below ('kB': 1024 but 'Kb'/'kb': 1000, 'mB': 1024**2,
    # etc.) are deliberate and preserved for compatibility with sites in the wild
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1777
1778
def parse_count(s):
    """Parse a view/like-style count ('1.2M', '3,456', 'Views: 7k') into an int, or None."""
    if s is None:
        return None

    # Strip a leading non-numeric label (e.g. 'Views ')
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    # k/m/b suffixes; 'kk' is sometimes used for millions
    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    ret = lookup_unit_table(_UNIT_TABLE, s)
    if ret is not None:
        return ret

    # Fall back to a leading plain number followed by other text
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))
1806
1807
def parse_resolution(s, *, lenient=False):
    """Extract {'width', 'height'} (or only 'height') from a resolution-like string.

    Recognizes 'WxH', '<N>p'/'<N>i' and '4k'/'8k' forms; returns {} when nothing
    matches or `s` is None. `lenient` drops the word-boundary guards around WxH.
    """
    if s is None:
        return {}

    if lenient:
        wxh_pattern = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    else:
        wxh_pattern = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    m = re.search(wxh_pattern, s)
    if m:
        return {'width': int(m.group('w')), 'height': int(m.group('h'))}

    m = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if m:
        return {'height': int(m.group(1))}

    m = re.search(r'\b([48])[kK]\b', s)
    return {'height': int(m.group(1)) * 540} if m else {}
1831
1832
def parse_bitrate(s):
    """Return the bitrate in kbps from a string like '128 kbps', else None."""
    if not isinstance(s, str):
        return None
    m = re.search(r'\b(\d+)\s*kbps', s)
    return int(m.group(1)) if m else None
1839
1840
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    return names.index(name) + 1 if name in names else None
1850
1851
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    return abbreviations.index(abbrev) + 1 if abbrev in abbreviations else None
1860
1861
def fix_xml_ampersands(xml_str):
    """Escape every '&' that does not already start a recognized XML entity."""
    bare_amp = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_amp.sub('&amp;', xml_str)
1868
1869
def setproctitle(title):
    """Best-effort: set the process name via libc prctl(PR_SET_NAME).
    Silently does nothing when ctypes/libc/prctl are unavailable."""
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # PR_SET_NAME = 15      Ref: /usr/include/linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1896
1897
def remove_start(s, start):
    """Strip `start` from the beginning of `s` if present; None passes through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1900
1901
def remove_end(s, end):
    """Strip `end` from the end of `s` if present; None passes through."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
1904
1905
def remove_quotes(s):
    """Strip one matching pair of surrounding quotes (single or double)."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1913
1914
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1921
1922
def url_basename(url):
    """Return the last path component of a URL ('' for bare hosts)."""
    path = urllib.parse.urlparse(url).path
    return path.strip('/').rsplit('/', 1)[-1]
1926
1927
def base_url(url):
    """Return the URL up to and including the last '/' before any query/fragment.
    Raises AttributeError when the URL has no path slash to anchor on."""
    m = re.match(r'https?://[^?#]+/', url)
    return m.group()
1930
1931
def urljoin(base, path):
    """Join `base` and `path` tolerantly: accepts bytes, passes through
    absolute/scheme-relative paths, and returns None for unusable inputs."""
    if isinstance(path, bytes):
        path = path.decode()
    if not path or not isinstance(path, str):
        return None
    # Already absolute (has a scheme) or scheme-relative
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
1945
1946
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce `v` to int (optionally reading attribute `get_attr` first),
    scaled by invscale/scale; return `default` when conversion fails."""
    value = getattr(v, get_attr, None) if get_attr and v is not None else v
    try:
        return int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
1954
1955
def str_or_none(v, default=None):
    """str(v), or `default` when v is None."""
    if v is None:
        return default
    return str(v)
1958
1959
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Tolerate thousands separators and a leading '+'
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
1967
1968
def float_or_none(v, scale=1, invscale=1, default=None):
    """float(v) * invscale / scale, or `default` when v is None or unparsable."""
    try:
        return default if v is None else float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
1976
1977
def bool_or_none(v, default=None):
    """Return v only when it is a real bool; otherwise `default`."""
    if isinstance(v, bool):
        return v
    return default
1980
1981
def strip_or_none(v, default=None):
    """v.strip() for strings; `default` for anything else (including None)."""
    if isinstance(v, str):
        return v.strip()
    return default
1984
1985
def url_or_none(url):
    """Return the stripped URL if it uses a recognized scheme (http(s), rtmp
    family, rtsp, mms, ftp(s)) or is scheme-relative; else None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
1991
1992
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a unix timestamp (int/float) or a 'YYYYMMDD' string using
    `date_format`; return `default` on any failure. Supports '%s' in the
    format (epoch seconds), which strftime lacks on Windows."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            dt_obj = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                      + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            dt_obj = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            dt_obj = None  # -> AttributeError below -> default
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(dt_obj.timestamp())}', date_format)
        return dt_obj.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2010
2011
def parse_duration(s):
    """Parse a duration string into seconds (float), or None if unparsable.

    Accepts clock-style '[[DD:]HH:]MM:]SS[.ms]', ISO-8601-like 'PT1H2M3S',
    and free-form '3 hours', '10 mins' notations.
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # 1) Clock style: seconds are limited to two digits only when preceded
    #    by a minutes component (conditional group on before_secs)
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2) ISO-8601-ish / verbose: years, months and weeks are matched but
        #    deliberately ignored in the final sum
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3) Fractional '1.5 hours' / '90 mins' forms
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        ms = ms.replace(':', '.')  # '1:23:45:67' style milliseconds use ':'
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2066
2067
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the file's real extension.

    'file.mp4' + 'temp' -> 'file.temp.mp4'. If *expected_real_ext* is given
    and the actual extension differs, *ext* is appended to the whole filename
    instead ('file.unknown' -> 'file.unknown.temp').

    BUG FIX: the mismatch branch previously returned the literal
    '(unknown).{ext}', silently discarding the original filename.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else f'{filename}.{ext}')
2074
2075
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's extension with *ext*.

    If *expected_real_ext* is given and the actual extension differs,
    *ext* is appended to the whole filename instead of replacing it.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    return f'{stem}.{ext}'
2081
2082
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        # OSError means the binary could not be found/started
        Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
2091
2092
def _get_exe_version_output(exe, args):
    """Run `exe args` and return its combined stdout/stderr output.

    Returns None if the executable ran but exited non-zero, and
    False if it could not be started at all.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
                                   stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if ret:
            return None
    except OSError:
        return False
    return stdout
2105
2106
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's --version output.

    Uses *version_re* (group 1 is the version) or a generic default;
    returns *unrecognized* when no version can be found.
    """
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    match = re.search(version_re, output)
    return match.group(1) if match else unrecognized
2116
2117
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        # The executable exists but exited with an error
        return unrecognized[-1]
    if not output:
        # False (not installed) or empty output
        return output
    return detect_exe_version(output, version_re, unrecognized[0])
2128
2129
def frange(start=0, stop=None, step=1):
    """Float range"""
    if stop is None:
        start, stop = 0, start
    if not step:
        # Mirrors the original: a zero step yields nothing
        return
    direction = 1 if step > 0 else -1
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2138
2139
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Raised instead of the builtin IndexError so callers can tell
        # "ran off the underlying iterable" apart from unrelated errors
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache is intentionally shared (not copied) by __copy__/__reversed__
        # so already-evaluated items are reused between views
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        # Pull every remaining item into the cache
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map an index on the reversed view to the underlying cache (~x == -x - 1)
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Otherwise, evaluate only as many items as the index/slice needs
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Evaluating a single end element is enough to know emptiness
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2227
2228
class PagedList:
    """Base class for lazily-fetched, page-based entry lists.

    Subclasses implement _getslice(); pages are fetched through
    self._pagefunc(pagenum) and optionally cached per page number.
    """

    class IndexError(IndexError):
        # Raised for out-of-range entries so callers can distinguish them
        # from unrelated IndexErrors
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc  # pagenum -> iterable of entries
        self._pagesize = pagesize
        self._pagecount = float('inf')  # unknown until a page fetch fails
        self._use_cache = use_cache
        self._cache = {}  # pagenum -> list of entries

    def getpage(self, pagenum):
        """Return the (possibly cached) list of entries of page *pagenum*."""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            # Pages beyond the known page count are empty by definition
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a plain list."""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # NOTE: cache must be enabled, since this is implemented via getslice
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]

    def __bool__(self):
        # True iff there is at least one entry
        return bool(self.getslice(0, 1))
2270
2271
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                # Page lies entirely before the requested range
                continue

            # Offsets of the requested range within the current page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember that this and all later pages are unavailable
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2311
2312
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        # use_cache is always enabled for this variant
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        skip_elems = start - start_page * self._pagesize  # offset into the first page
        only_more = None if end is None else end - start  # entries still wanted
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    # This page satisfies the remainder of the request
                    yield from page_results[:only_more]
                    break
            yield from page_results
2337
2338
class PlaylistEntries:
    """Resolve a playlist info_dict's 'entries' according to the user's
    --playlist-items / --playlist-start / --playlist-end selection,
    yielding (1-based index, entry) pairs."""

    MissingEntry = object()  # sentinel: the entry at this index was not extracted
    is_exhausted = False  # True once the full playlist length is known

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Spread the already-extracted entries over their original
            # (1-based) playlist positions, leaving gaps as MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches one '--playlist-items' segment: START[:-END[:STEP]]
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or slice per comma-separated segment of a
        --playlist-items specification; raises ValueError on bad input."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # 'inf' ends become float('inf') through float_or_none
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (index, entry) pairs for the user's selection, stopping
        early when the match filter signals it."""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Total entry count, if determinable without further extraction."""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function i -> entry normalizing the different backing
        # containers' error behavior into self.IndexError / EntryNotInPlaylist
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Route extraction errors through YDL's standard handling
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Accepts an int or slice of 1-based playlist positions and yields
        # (1-based index, entry) pairs
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2473
2474
def uppercase_escape(s):
    """Decode '\\UXXXXXXXX' escape sequences in *s* into their characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
2481
2482
def lowercase_escape(s):
    """Decode '\\uXXXX' escape sequences in *s* into their characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
2489
2490
def parse_qs(url, **kwargs):
    """Parse the query string of *url* into a dict of lists (urllib.parse.parse_qs)."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2493
2494
def read_batch_urls(batch_fd):
    """Read a batch file object and return its list of URLs.

    Skips blank lines and comments ('#', ';', ']'), strips BOMs and
    surrounding whitespace, and cuts trailing ' #...' comments.
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit must be passed by keyword — positional use is
        # deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2512
2513
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data (same signature as urllib.parse.urlencode) and
    return it as ASCII bytes suitable for a request body."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2516
2517
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url str or parse url tuple
    @param query_update update query
    @returns str
    """
    if isinstance(url, str):
        if not kwargs and not query_update:
            # Nothing to change — skip the parse/unparse round-trip
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2536
2537
def update_url_query(url, query):
    """Merge the dict *query* into the query string of *url* and return the new URL."""
    return update_url(url, query_update=query)
2540
2541
2542 def _multipart_encode_impl(data, boundary):
2543 content_type = 'multipart/form-data; boundary=%s' % boundary
2544
2545 out = b''
2546 for k, v in data.items():
2547 out += b'--' + boundary.encode('ascii') + b'\r\n'
2548 if isinstance(k, str):
2549 k = k.encode()
2550 if isinstance(v, str):
2551 v = v.encode()
2552 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2553 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2554 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2555 if boundary.encode('ascii') in content:
2556 raise ValueError('Boundary overlaps with data')
2557 out += content
2558
2559 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2560
2561 return out, content_type
2562
2563
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    # Retry with fresh random boundaries until none collides with the data
    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            if has_specified_boundary:
                raise
            boundary = None
2592
2593
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """True if *x* is an instance of *allowed_types* but not of *blocked_types*
    (by default str, bytes and mappings don't count as iterable-like)."""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
2598
2599
def variadic(x, allowed_types=NO_DEFAULT):
    """Return *x* unchanged if it is iterable-like, else wrap it in a 1-tuple."""
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
2605
2606
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function with *args*/*kwargs*, returning the first result
    that does not raise a common lookup/arithmetic error and (if given)
    is an instance of *expected_type*. Returns None if none qualify."""
    for fn in funcs:
        try:
            result = fn(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
2616
2617
def try_get(src, getter, expected_type=None):
    """Apply one or more getter callables to *src*, returning the first
    result that doesn't raise and (if given) matches *expected_type*."""
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2620
2621
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of *dct* keeping only items for which cndn(key, value)
    is truthy (by default, drops None values)."""
    return {key: value for key, value in dct.items() if cndn(key, value)}
2624
2625
def merge_dicts(*dicts):
    """Merge dicts left-to-right: the first non-None value for a key wins,
    except that a later string value replaces an earlier empty string."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            if key not in merged:
                if value is not None:
                    merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
2634
2635
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as str, decoding bytes-like input with *encoding*.

    NOTE: the default encoding is evaluated once, at module import time.
    """
    return string if isinstance(string, str) else str(string, encoding, errors)
2638
2639
# US (MPAA) movie ratings mapped to a minimum viewer age
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines mapped to a minimum viewer age
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2657
2658
def parse_age_limit(s):
    """Parse an age limit (int 0-21, 'NN+', a US movie rating, or a TV
    parental guideline) into an int, or None if unrecognized."""
    # isinstance(False, int) is True, so an exact type check is needed here
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    mobj = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if mobj:
        return TV_PARENTAL_GUIDELINES['TV-' + mobj.group(1)]
    return None
2675
2676
def strip_jsonp(code):
    """Strip a JSONP wrapper — e.g. 'window.cb && cb({...}); // trailer' —
    returning only the JSON payload passed to the callback."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
2685
2686
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript object/expression into valid JSON text.

    @param vars    dict of identifier -> JSON text to substitute
    @param strict  if True, raise ValueError on unknown identifiers
                   instead of quoting them as strings
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # Bare hex/octal integers (possibly used as object keys, hence the ':')
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Keep escapes JSON understands, rewrite \xNN to \u00NN, drop
        # escaped newlines (line continuations), unescape everything else
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Recursively evaluate a `${...}` template-string interpolation
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            # Re-quote strings with double quotes, fixing up escapes and
            # (for backtick strings) template interpolations
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # A trailing ':' means this integer is an object key
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # new Map([[k, v], ...]) -> plain JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # Best-effort rewrites of common constructor/function-call patterns
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2766
2767
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def lookup(qid):
        # Unknown ids rank below all known ones
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return lookup
2776
2777
# Stages at which post-processors may be scheduled to run
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output filename templates
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Output template kind -> default filename suffix (None = no dedicated suffix)
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
2798
# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template placeholders: {0} = allowed key pattern, {1} = allowed conversion types
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
    '''


# All conversion types accepted by %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2817
2818
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
2827
2828
def version_tuple(v):
    """Split a dot/dash-separated version string into a tuple of ints.
    Raises ValueError on non-numeric components."""
    return tuple(map(int, re.split(r'[-.]', v)))
2831
2832
def is_outdated_version(version, limit, assume_new=True):
    """True if *version* is older than *limit*; falls back to *assume_new*
    (inverted) when either string is missing or unparsable."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
2840
2841
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported locally to avoid a circular dependency with the update module
    from ..update import is_non_updateable

    return not is_non_updateable()
2848
2849
def args_to_str(args):
    """Shell-quote and join *args* for display (not for execution)."""
    # Get a short string representation for a subprocess command
    return ' '.join(compat_shlex_quote(a) for a in args)
2853
2854
def error_to_str(err):
    """Format an exception as 'ExceptionType: message' for display."""
    kind = type(err).__name__
    return f'{kind}: {err}'
2857
2858
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (optionally with parameters, e.g. 'video/mp4; codecs=..')
    to a file extension.

    Falls back to *default* (if given) for non-str input, otherwise derives
    an extension from the subtype.
    """
    if not isinstance(mt, str):
        if default is not NO_DEFAULT:
            return default
        return None

    # Keyed by full MIME type or bare subtype
    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'video/ogg': 'ogv',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Strip parameters ('; charset=...') and normalize case
    mimetype = mt.partition(';')[0].strip().lower()
    _, _, subtype = mimetype.rpartition('/')

    # Try full type, then subtype, then the subtype's '+'-suffix (e.g. 'xml')
    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    elif default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
2948
2949
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension or URL, or None."""
    if not ext_or_url:
        return None
    if '.' not in ext_or_url:
        # Turn a bare extension into a fake filename so guess_type works
        ext_or_url = f'file.{ext_or_url}'
    mime, _ = mimetypes.guess_type(ext_or_url)
    return mime
2956
2957
def parse_codecs(codecs_str):
    """Parse an RFC 6381 'codecs' attribute into vcodec/acodec/scodec and
    dynamic-range info. Returns {} when nothing could be identified."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Strip leading zeroes from each dotted part before classifying
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                # Only the first video codec is kept
                continue
            vcodec = full_codec
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Two unknown codecs: assume video + audio, in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
2998
2999
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension able to hold the given video/audio codecs
    and extensions, honoring *preferences* where possible (mkv as fallback)."""
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Normalize a codec list: first entry, base fourcc, zeroes stripped, lowercased
    sanitize_codec = functools.partial(
        try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    # First pass: pick by codec compatibility
    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    # Second pass: pick by extension-family compatibility
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3039
3040
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Guess a file extension from a response's headers: Content-Disposition
    filename, then x-amz-meta-name, then Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    # S3-style metadata sometimes carries the original filename
    meta_ext = getheader('x-amz-meta-name')
    if meta_ext:
        e = meta_ext.rpartition('.')[2]
        if e:
            return e

    return mimetype2ext(getheader('Content-Type'), default=default)
3059
3060
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI for *data* (bytes) with the given MIME type."""
    encoded = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{encoded}'
3063
3064
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit set, or content available for everyone
        return False
    return age_limit < content_limit
3073
3074
# List of known byte-order-marks (BOM)
# NOTE: the UTF-32 entries precede UTF-16 — b'\xff\xfe' (UTF-16-LE) is a
# prefix of b'\xff\xfe\x00\x00' (UTF-32-LE), and consumers scan in order
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3083
3084
3085 def is_html(first_bytes):
3086 """ Detect whether a file contains HTML by examining its first bytes. """
3087
3088 encoding = 'utf-8'
3089 for bom, enc in BOMS:
3090 while first_bytes.startswith(bom):
3091 encoding, first_bytes = enc, first_bytes[len(bom):]
3092
3093 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3094
3095
def determine_protocol(info_dict):
    """Return the download protocol for *info_dict*, deriving it from the URL
    when the 'protocol' field is not set explicitly."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for proto in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(proto):
            return proto

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    # Fall back to the URL scheme (http, https, ...)
    return urllib.parse.urlparse(url).scheme
3116
3117
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param header_row   Column headers
    @param data         List of rows, each a list of cell values
    @param delim        String used to draw a separator line below the header, or False for none
    @param extra_gap    Number of extra spaces between columns
    @param hide_empty   Drop columns whose data cells all have zero visible width
    """
    def width(string):
        # Visible width: terminal escape sequences and the \t alignment marker take no space
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only the columns whose corresponding filter entry is truthy
        # (missing entries default to True, so extra columns are kept)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # When hiding empty columns, filter both the header and every data row
    # using the per-column maximum widths of the data alone
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Right-align the part after \t by replacing it with the needed padding
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3148
3149
3150 def _match_one(filter_part, dct, incomplete):
3151 # TODO: Generalize code with YoutubeDL._build_format_filter
3152 STRING_OPERATORS = {
3153 '*=': operator.contains,
3154 '^=': lambda attr, value: attr.startswith(value),
3155 '$=': lambda attr, value: attr.endswith(value),
3156 '~=': lambda attr, value: re.search(value, attr),
3157 }
3158 COMPARISON_OPERATORS = {
3159 **STRING_OPERATORS,
3160 '<=': operator.le, # "<=" must be defined above "<"
3161 '<': operator.lt,
3162 '>=': operator.ge,
3163 '>': operator.gt,
3164 '=': operator.eq,
3165 }
3166
3167 if isinstance(incomplete, bool):
3168 is_incomplete = lambda _: incomplete
3169 else:
3170 is_incomplete = lambda k: k in incomplete
3171
3172 operator_rex = re.compile(r'''(?x)
3173 (?P<key>[a-z_]+)
3174 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3175 (?:
3176 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3177 (?P<strval>.+?)
3178 )
3179 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3180 m = operator_rex.fullmatch(filter_part.strip())
3181 if m:
3182 m = m.groupdict()
3183 unnegated_op = COMPARISON_OPERATORS[m['op']]
3184 if m['negation']:
3185 op = lambda attr, value: not unnegated_op(attr, value)
3186 else:
3187 op = unnegated_op
3188 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3189 if m['quote']:
3190 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3191 actual_value = dct.get(m['key'])
3192 numeric_comparison = None
3193 if isinstance(actual_value, (int, float)):
3194 # If the original field is a string and matching comparisonvalue is
3195 # a number we should respect the origin of the original field
3196 # and process comparison value as a string (see
3197 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3198 try:
3199 numeric_comparison = int(comparison_value)
3200 except ValueError:
3201 numeric_comparison = parse_filesize(comparison_value)
3202 if numeric_comparison is None:
3203 numeric_comparison = parse_filesize(f'{comparison_value}B')
3204 if numeric_comparison is None:
3205 numeric_comparison = parse_duration(comparison_value)
3206 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3207 raise ValueError('Operator %s only supports string values!' % m['op'])
3208 if actual_value is None:
3209 return is_incomplete(m['key']) or m['none_inclusive']
3210 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3211
3212 UNARY_OPERATORS = {
3213 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3214 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3215 }
3216 operator_rex = re.compile(r'''(?x)
3217 (?P<op>%s)\s*(?P<key>[a-z_]+)
3218 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3219 m = operator_rex.fullmatch(filter_part.strip())
3220 if m:
3221 op = UNARY_OPERATORS[m.group('op')]
3222 actual_value = dct.get(m.group('key'))
3223 if is_incomplete(m.group('key')) and actual_value is None:
3224 return True
3225 return op(actual_value)
3226
3227 raise ValueError('Invalid filter part %r' % filter_part)
3228
3229
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
                      Can be True/False to indicate all/none of the keys may be missing.
                      All conditions on incomplete keys pass if the key is missing
    """
    # '&' joins conditions; a backslash escapes a literal '&'
    parts = re.split(r'(?<!\\)&', filter_str)
    return all(_match_one(part.replace(r'\&', '&'), dct, incomplete) for part in parts)
3240
3241
def match_filter_func(filters, breaking_filters=None):
    """Build a match-filter callable from filter expression string(s).

    @param filters           A filter string or collection of filter strings;
                             a lone '-' entry switches on "interactive" mode
    @param breaking_filters  Filter(s) whose rejection raises RejectedVideoReached
    @returns                 None if no filters were given; otherwise a function
                             (info_dict, incomplete=False) returning None when the
                             video passes, NO_DEFAULT when it passes in interactive
                             mode, or a skip-reason string when it is rejected
    """
    if not filters and not breaking_filters:
        return None
    repr_ = f'{match_filter_func.__module__}.{match_filter_func.__qualname__}({filters}, {breaking_filters})'

    # Breaking filters are themselves compiled with this function; when they
    # reject, _match_func raises instead of merely skipping
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    # A '-' entry makes passing (complete) videos return NO_DEFAULT instead of None
    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    @function_with_repr.set_repr(repr_)
    def _match_func(info_dict, incomplete=False):
        ret = breaking_filters(info_dict, incomplete)
        if ret is not None:
            raise RejectedVideoReached(ret)

        # An empty filter set (e.g. only '-' was given) matches everything
        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            return NO_DEFAULT if interactive and not incomplete else None
        else:
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3267
3268
class download_range_func:
    """Callable that yields the sections (matched chapters and/or explicit
    time ranges) of a video to download."""

    def __init__(self, chapters, ranges, from_info=False):
        # chapters: regexes matched against chapter titles
        # ranges: (start_time, end_time) pairs; negative values count back from the end
        # from_info: also yield a section built from the info dict's own start_time/end_time
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
        # Only warn when chapter regexes were requested but nothing matched
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # No sections requested at all: one empty section = the whole video
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # Negative timestamps are relative to the end of the video (when the duration is known)
        return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time

    def __eq__(self, other):
        # NOTE(review): from_info is not part of the comparison — confirm this is intentional
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges)

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3309
3310
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds, or None if unparsable."""
    if not time_expr:
        return None

    # Plain offset, e.g. "12.3" or "12.3s"
    match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if match:
        return float(match.group('time_offset'))

    # Clock time, e.g. "0:01:02.5"; a ':' before the fraction is read as a decimal point
    match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if match:
        hours, minutes, seconds = match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
3322
3323
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours, mins, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, msec)
3326
3327
def ass_subtitles_timecode(seconds):
    """Format a duration in seconds as an ASS timecode (H:MM:SS.cc, i.e. centiseconds)."""
    hours, mins, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (hours, mins, secs, msec / 10)
3331
3332
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitle data to SRT.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no paragraph elements
    '''
    # Legacy TTML namespaces are rewritten to their modern equivalents before parsing
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # Only these tts:* style attributes are translated into SRT markup
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser:
        # NOTE(review): these are class-level (shared) mutable attributes; balanced
        # start/end calls leave the lists empty between uses, but a fresh instance
        # is created per paragraph in parse_node anyway — confirm before changing
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Effective style = document default, overlaid by the referenced
                # style id, overlaid by inline tts:* attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from the enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize one <p> element through the TTML parser to get its SRT text
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; re-run until every parent style has been seen
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    # Parent not processed yet — retry on the next pass
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style referenced by <body> or <div> becomes the document default
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3499
3500
def cli_option(params, command_option, param, separator=None):
    """Build the CLI arguments for an option whose value is params[param].

    Returns [] when the value is unset, [command_option, value] without a
    separator, or a single joined 'optionSEPvalue' argument with one.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3506
3507
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the CLI arguments for a boolean option, mapping True/False to the
    given string values; an unset (None) parameter yields no arguments."""
    flag = params.get(param)
    assert flag in (True, False, None)
    # Reuse cli_option by looking the flag up in a bool -> string mapping
    return cli_option({True: true_value, False: false_value}, command_option, flag, separator)
3512
3513
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3516
3517
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Look up extra CLI arguments in argdict, trying each entry of keys in turn.

    @param argdict     Mapping of lowercase keys to argument lists; a plain
                       list/tuple is accepted for backward compatibility
    @param keys        Keys (or tuples of alternative keys) to try, in priority order
    @param default     Returned when nothing matches
    @param use_compat  Whether a legacy list/tuple argdict is returned as-is
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        return argdict if use_compat else default
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        # Collect the argument lists of every matching key alternative
        matches = [args for args in (argdict.get(key.lower()) for key in variadic(key_list))
                   if args is not None]
        if matches:
            return list(itertools.chain.from_iterable(matches))
    return default
3536
3537
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve extra CLI arguments for a main_key/executable combination by
    building the prioritized key list for cli_configuration_args."""
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    lookup_keys = [f'{root_key}{key}' for key in (keys or [''])]
    if root_key not in lookup_keys:
        # Only variant keys were requested: no generic fallbacks, no compat mode
        use_compat = False
    else:
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3549
3550
class ISO639Utils:
    """Conversions between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are used, so tags like "en-US" also resolve;
        # returns None for unknown codes
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear scan over the map; falls through (returns None) when no match is found
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
3755
3756
class ISO3166Utils:
    """Mapping of ISO 3166-1 alpha-2 country codes to full country names."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive lookup; returns None for unknown codes
        return cls._country_map.get(code.upper())
4018
4019
4020 class GeoUtils:
4021 # Major IPv4 address blocks per country
4022 _country_ip_map = {
4023 'AD': '46.172.224.0/19',
4024 'AE': '94.200.0.0/13',
4025 'AF': '149.54.0.0/17',
4026 'AG': '209.59.64.0/18',
4027 'AI': '204.14.248.0/21',
4028 'AL': '46.99.0.0/16',
4029 'AM': '46.70.0.0/15',
4030 'AO': '105.168.0.0/13',
4031 'AP': '182.50.184.0/21',
4032 'AQ': '23.154.160.0/24',
4033 'AR': '181.0.0.0/12',
4034 'AS': '202.70.112.0/20',
4035 'AT': '77.116.0.0/14',
4036 'AU': '1.128.0.0/11',
4037 'AW': '181.41.0.0/18',
4038 'AX': '185.217.4.0/22',
4039 'AZ': '5.197.0.0/16',
4040 'BA': '31.176.128.0/17',
4041 'BB': '65.48.128.0/17',
4042 'BD': '114.130.0.0/16',
4043 'BE': '57.0.0.0/8',
4044 'BF': '102.178.0.0/15',
4045 'BG': '95.42.0.0/15',
4046 'BH': '37.131.0.0/17',
4047 'BI': '154.117.192.0/18',
4048 'BJ': '137.255.0.0/16',
4049 'BL': '185.212.72.0/23',
4050 'BM': '196.12.64.0/18',
4051 'BN': '156.31.0.0/16',
4052 'BO': '161.56.0.0/16',
4053 'BQ': '161.0.80.0/20',
4054 'BR': '191.128.0.0/12',
4055 'BS': '24.51.64.0/18',
4056 'BT': '119.2.96.0/19',
4057 'BW': '168.167.0.0/16',
4058 'BY': '178.120.0.0/13',
4059 'BZ': '179.42.192.0/18',
4060 'CA': '99.224.0.0/11',
4061 'CD': '41.243.0.0/16',
4062 'CF': '197.242.176.0/21',
4063 'CG': '160.113.0.0/16',
4064 'CH': '85.0.0.0/13',
4065 'CI': '102.136.0.0/14',
4066 'CK': '202.65.32.0/19',
4067 'CL': '152.172.0.0/14',
4068 'CM': '102.244.0.0/14',
4069 'CN': '36.128.0.0/10',
4070 'CO': '181.240.0.0/12',
4071 'CR': '201.192.0.0/12',
4072 'CU': '152.206.0.0/15',
4073 'CV': '165.90.96.0/19',
4074 'CW': '190.88.128.0/17',
4075 'CY': '31.153.0.0/16',
4076 'CZ': '88.100.0.0/14',
4077 'DE': '53.0.0.0/8',
4078 'DJ': '197.241.0.0/17',
4079 'DK': '87.48.0.0/12',
4080 'DM': '192.243.48.0/20',
4081 'DO': '152.166.0.0/15',
4082 'DZ': '41.96.0.0/12',
4083 'EC': '186.68.0.0/15',
4084 'EE': '90.190.0.0/15',
4085 'EG': '156.160.0.0/11',
4086 'ER': '196.200.96.0/20',
4087 'ES': '88.0.0.0/11',
4088 'ET': '196.188.0.0/14',
4089 'EU': '2.16.0.0/13',
4090 'FI': '91.152.0.0/13',
4091 'FJ': '144.120.0.0/16',
4092 'FK': '80.73.208.0/21',
4093 'FM': '119.252.112.0/20',
4094 'FO': '88.85.32.0/19',
4095 'FR': '90.0.0.0/9',
4096 'GA': '41.158.0.0/15',
4097 'GB': '25.0.0.0/8',
4098 'GD': '74.122.88.0/21',
4099 'GE': '31.146.0.0/16',
4100 'GF': '161.22.64.0/18',
4101 'GG': '62.68.160.0/19',
4102 'GH': '154.160.0.0/12',
4103 'GI': '95.164.0.0/16',
4104 'GL': '88.83.0.0/19',
4105 'GM': '160.182.0.0/15',
4106 'GN': '197.149.192.0/18',
4107 'GP': '104.250.0.0/19',
4108 'GQ': '105.235.224.0/20',
4109 'GR': '94.64.0.0/13',
4110 'GT': '168.234.0.0/16',
4111 'GU': '168.123.0.0/16',
4112 'GW': '197.214.80.0/20',
4113 'GY': '181.41.64.0/18',
4114 'HK': '113.252.0.0/14',
4115 'HN': '181.210.0.0/16',
4116 'HR': '93.136.0.0/13',
4117 'HT': '148.102.128.0/17',
4118 'HU': '84.0.0.0/14',
4119 'ID': '39.192.0.0/10',
4120 'IE': '87.32.0.0/12',
4121 'IL': '79.176.0.0/13',
4122 'IM': '5.62.80.0/20',
4123 'IN': '117.192.0.0/10',
4124 'IO': '203.83.48.0/21',
4125 'IQ': '37.236.0.0/14',
4126 'IR': '2.176.0.0/12',
4127 'IS': '82.221.0.0/16',
4128 'IT': '79.0.0.0/10',
4129 'JE': '87.244.64.0/18',
4130 'JM': '72.27.0.0/17',
4131 'JO': '176.29.0.0/16',
4132 'JP': '133.0.0.0/8',
4133 'KE': '105.48.0.0/12',
4134 'KG': '158.181.128.0/17',
4135 'KH': '36.37.128.0/17',
4136 'KI': '103.25.140.0/22',
4137 'KM': '197.255.224.0/20',
4138 'KN': '198.167.192.0/19',
4139 'KP': '175.45.176.0/22',
4140 'KR': '175.192.0.0/10',
4141 'KW': '37.36.0.0/14',
4142 'KY': '64.96.0.0/15',
4143 'KZ': '2.72.0.0/13',
4144 'LA': '115.84.64.0/18',
4145 'LB': '178.135.0.0/16',
4146 'LC': '24.92.144.0/20',
4147 'LI': '82.117.0.0/19',
4148 'LK': '112.134.0.0/15',
4149 'LR': '102.183.0.0/16',
4150 'LS': '129.232.0.0/17',
4151 'LT': '78.56.0.0/13',
4152 'LU': '188.42.0.0/16',
4153 'LV': '46.109.0.0/16',
4154 'LY': '41.252.0.0/14',
4155 'MA': '105.128.0.0/11',
4156 'MC': '88.209.64.0/18',
4157 'MD': '37.246.0.0/16',
4158 'ME': '178.175.0.0/17',
4159 'MF': '74.112.232.0/21',
4160 'MG': '154.126.0.0/17',
4161 'MH': '117.103.88.0/21',
4162 'MK': '77.28.0.0/15',
4163 'ML': '154.118.128.0/18',
4164 'MM': '37.111.0.0/17',
4165 'MN': '49.0.128.0/17',
4166 'MO': '60.246.0.0/16',
4167 'MP': '202.88.64.0/20',
4168 'MQ': '109.203.224.0/19',
4169 'MR': '41.188.64.0/18',
4170 'MS': '208.90.112.0/22',
4171 'MT': '46.11.0.0/16',
4172 'MU': '105.16.0.0/12',
4173 'MV': '27.114.128.0/18',
4174 'MW': '102.70.0.0/15',
4175 'MX': '187.192.0.0/11',
4176 'MY': '175.136.0.0/13',
4177 'MZ': '197.218.0.0/15',
4178 'NA': '41.182.0.0/16',
4179 'NC': '101.101.0.0/18',
4180 'NE': '197.214.0.0/18',
4181 'NF': '203.17.240.0/22',
4182 'NG': '105.112.0.0/12',
4183 'NI': '186.76.0.0/15',
4184 'NL': '145.96.0.0/11',
4185 'NO': '84.208.0.0/13',
4186 'NP': '36.252.0.0/15',
4187 'NR': '203.98.224.0/19',
4188 'NU': '49.156.48.0/22',
4189 'NZ': '49.224.0.0/14',
4190 'OM': '5.36.0.0/15',
4191 'PA': '186.72.0.0/15',
4192 'PE': '186.160.0.0/14',
4193 'PF': '123.50.64.0/18',
4194 'PG': '124.240.192.0/19',
4195 'PH': '49.144.0.0/13',
4196 'PK': '39.32.0.0/11',
4197 'PL': '83.0.0.0/11',
4198 'PM': '70.36.0.0/20',
4199 'PR': '66.50.0.0/16',
4200 'PS': '188.161.0.0/16',
4201 'PT': '85.240.0.0/13',
4202 'PW': '202.124.224.0/20',
4203 'PY': '181.120.0.0/14',
4204 'QA': '37.210.0.0/15',
4205 'RE': '102.35.0.0/16',
4206 'RO': '79.112.0.0/13',
4207 'RS': '93.86.0.0/15',
4208 'RU': '5.136.0.0/13',
4209 'RW': '41.186.0.0/16',
4210 'SA': '188.48.0.0/13',
4211 'SB': '202.1.160.0/19',
4212 'SC': '154.192.0.0/11',
4213 'SD': '102.120.0.0/13',
4214 'SE': '78.64.0.0/12',
4215 'SG': '8.128.0.0/10',
4216 'SI': '188.196.0.0/14',
4217 'SK': '78.98.0.0/15',
4218 'SL': '102.143.0.0/17',
4219 'SM': '89.186.32.0/19',
4220 'SN': '41.82.0.0/15',
4221 'SO': '154.115.192.0/18',
4222 'SR': '186.179.128.0/17',
4223 'SS': '105.235.208.0/21',
4224 'ST': '197.159.160.0/19',
4225 'SV': '168.243.0.0/16',
4226 'SX': '190.102.0.0/20',
4227 'SY': '5.0.0.0/16',
4228 'SZ': '41.84.224.0/19',
4229 'TC': '65.255.48.0/20',
4230 'TD': '154.68.128.0/19',
4231 'TG': '196.168.0.0/14',
4232 'TH': '171.96.0.0/13',
4233 'TJ': '85.9.128.0/18',
4234 'TK': '27.96.24.0/21',
4235 'TL': '180.189.160.0/20',
4236 'TM': '95.85.96.0/19',
4237 'TN': '197.0.0.0/11',
4238 'TO': '175.176.144.0/21',
4239 'TR': '78.160.0.0/11',
4240 'TT': '186.44.0.0/15',
4241 'TV': '202.2.96.0/19',
4242 'TW': '120.96.0.0/11',
4243 'TZ': '156.156.0.0/14',
4244 'UA': '37.52.0.0/14',
4245 'UG': '102.80.0.0/13',
4246 'US': '6.0.0.0/8',
4247 'UY': '167.56.0.0/13',
4248 'UZ': '84.54.64.0/18',
4249 'VA': '212.77.0.0/19',
4250 'VC': '207.191.240.0/21',
4251 'VE': '186.88.0.0/13',
4252 'VG': '66.81.192.0/20',
4253 'VI': '146.226.0.0/16',
4254 'VN': '14.160.0.0/11',
4255 'VU': '202.80.32.0/20',
4256 'WF': '117.20.32.0/21',
4257 'WS': '202.4.32.0/19',
4258 'YE': '134.35.0.0/16',
4259 'YT': '41.242.116.0/22',
4260 'ZA': '41.0.0.0/11',
4261 'ZM': '102.144.0.0/13',
4262 'ZW': '102.177.192.0/18',
4263 }
4264
4265 @classmethod
4266 def random_ipv4(cls, code_or_block):
4267 if len(code_or_block) == 2:
4268 block = cls._country_ip_map.get(code_or_block.upper())
4269 if not block:
4270 return None
4271 else:
4272 block = code_or_block
4273 addr, preflen = block.split('/')
4274 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4275 addr_max = addr_min | (0xffffffff >> int(preflen))
4276 return str(socket.inet_ntoa(
4277 struct.pack('!L', random.randint(addr_min, addr_max))))
4278
4279
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387

def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    # int.to_bytes replaces the original hand-rolled 32-bit chunking loop.
    # Non-positive n yields the single byte b'\0', matching the old code's
    # behaviour for n == 0 (and for negative n, whose loop never ran).
    if n <= 0:
        s = b'\000'
    else:
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    # Pad the front with zero bytes up to a multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4312
4313
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes supersedes the original manual zero-padding plus
    # 4-byte struct-unpacking accumulation loop; empty input yields 0
    return int.from_bytes(s, 'big')
4329
4330
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The data is reversed before being interpreted as a big-endian hex number
    plaintext = int(binascii.hexlify(data[::-1]), 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return '%x' % ciphertext
4346
4347
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    # Layout: [0, 2] + random non-zero-ish filler + [0] + data
    filler_len = length - len(data) - 3
    if filler_len < 8:  # equivalent to len(data) > length - 11
        raise ValueError('Input data too long for PKCS#1 padding')

    filler = [random.randint(0, 254) for _ in range(filler_len)]
    return [0, 2, *filler, 0, *data]
4361
4362
4363 def _base_n_table(n, table):
4364 if not table and not n:
4365 raise ValueError('Either table or n must be specified')
4366 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4367
4368 if n and n != len(table):
4369 raise ValueError(f'base {n} exceeds table length {len(table)}')
4370 return table
4371
4372
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    digits = _base_n_table(n, table)
    if not num:
        return digits[0]

    base = len(digits)
    chunks = []
    while num:
        num, remainder = divmod(num, base)
        chunks.append(digits[remainder])
    # Digits were produced least-significant first
    return ''.join(reversed(chunks))
4384
4385
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    lookup = {char: idx for idx, char in enumerate(_base_n_table(n, table))}
    base = len(lookup)
    value = 0
    for char in string:
        value = value * base + lookup[char]
    return value
4393
4394
def decode_packed_codes(code):
    """Expand 'packed' obfuscated code: every word in the payload is a base-N
    index into the symbol list extracted via PACKED_CODES_RE."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each base-N encoded index to its symbol (or to itself if empty)
    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        key = encode_base_n(idx, base)
        symbol_table[key] = symbols[idx] or key

    return re.sub(
        r'\b(\w+)\b', lambda match: symbol_table[match.group(0)],
        obfuscated_code)
4411
4412
def caesar(s, alphabet, shift):
    """Shift every character of s that occurs in alphabet by `shift` positions, wrapping around."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4420
4421
def rot47(s):
    """Apply the ROT47 substitution cipher (Caesar shift of 47 over printable ASCII)."""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4424
4425
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=VALUE,KEY="quoted,value"') into a dict."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Strip the surrounding quotes from quoted values
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
4433
4434
def urshift(val, n):
    """Unsigned 32-bit right shift, like JavaScript's `>>>` operator."""
    if val < 0:
        # Reinterpret the negative value as its unsigned 32-bit equivalent
        val += 0x100000000
    return val >> n
4437
4438
def write_xattr(path, key, value):
    """Write the extended attribute `key` = `value` (bytes) on the file at `path`.

    Tries, in order: NTFS Alternate Data Streams (Windows), os.setxattr or the
    pyxattr/xattr modules, then the setfattr/xattr command-line executables.
    Raises XAttrMetadataError when writing fails and XAttrUnavailableError when
    no write mechanism is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            # 'path:key' addresses the named alternate data stream of the file
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    setxattr = None
    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The command-line tools take the value as text, not bytes
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4490
4491
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to a random date between 1950 and 1995."""
    range_start = datetime.date(1950, 1, 1)
    range_end = datetime.date(1995, 12, 31)
    picked = range_start + datetime.timedelta(
        random.randint(0, (range_end - range_start).days))
    return {
        field: str(component)
        for field, component in (
            (year_field, picked.year),
            (month_field, picked.month),
            (day_field, picked.day),
        )
    }
4502
4503
def find_available_port(interface=''):
    """Return a free TCP port number on `interface`, or None if binding fails."""
    sock = socket.socket()
    try:
        # Binding to port 0 lets the OS pick an unused port
        sock.bind((interface, 0))
        return sock.getsockname()[1]
    except OSError:
        return None
    finally:
        sock.close()
4511
4512
# Templates for internet shortcut files, which are plain text files.

# Windows .url internet-shortcut format
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS .webloc shortcut (an XML plist document)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org .desktop link entry (Linux desktop environments)
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Shortcut-type name -> template text
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4544
4545
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """
    parts = urllib.parse.urlparse(iri)

    if '[' in parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` sets below follow https://url.spec.whatwg.org/#percent-encoded-bytes:
    # everything but letters, digits, '_.-' and the listed characters gets
    # percent-encoded as UTF-8; existing percent-escapes ('%' is safe) are kept.

    netloc = ''
    if parts.username:
        netloc += urllib.parse.quote(parts.username, safe=r"!$%&'()*+,~")
        if parts.password is not None:
            netloc += ':' + urllib.parse.quote(parts.password, safe=r"!$%&'()*+,~")
        netloc += '@'

    # Punycode for Unicode hostnames; the 'idna' encoding produces ASCII text
    netloc += parts.hostname.encode('idna').decode()
    # NOTE(review): port 80 is dropped regardless of scheme; for https this
    # changes the effective default port — confirm intended
    if parts.port is not None and parts.port != 80:
        netloc += f':{parts.port}'

    quoted_path = urllib.parse.quote_plus(parts.path, safe=r"!$%&'()*+,/:;=@|~")
    # Legacy parameter component; quoted the same way as the path
    quoted_params = urllib.parse.quote_plus(parts.params, safe=r"!$%&'()*+,/:;=@|~")
    quoted_query = urllib.parse.quote_plus(parts.query, safe=r"!$%&'()*+,/:;=?@{|}~")
    quoted_fragment = urllib.parse.quote_plus(parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")

    return urllib.parse.urlunparse(
        (parts.scheme, netloc, quoted_path, quoted_params, quoted_query, quoted_fragment))
4588
4589
def to_high_limit_path(path):
    """On Windows, prefix the absolute path with '\\\\?\\' to bypass the MAX_PATH limitation."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
4596
4597
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Extract `field` from obj, returning `default` when the value is ignorable,
    else `template` interpolated with func(value)."""
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        # Default behaviour: any falsy value is ignored
        ignorable = not val
    else:
        ignorable = val in variadic(ignore)
    return default if ignorable else template % func(val)
4603
4604
def clean_podcast_url(url):
    """Strip known podcast tracking/analytics prefixes from url."""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/'''
    stripped = re.sub(tracking_prefix_re, '', url)
    # Stripping may leave a nested scheme ('https://http://...'); keep the inner one
    return re.sub(r'^\w+://(\w+://)', r'\1', stripped)
4625
4626
4627 _HEX_TABLE = '0123456789abcdef'
4628
4629
def random_uuidv4():
    """Generate a random RFC 4122 version-4 UUID string.

    The position marked 'y' in the template carries the variant nibble, which
    RFC 4122 section 4.4 restricts to 8, 9, a or b; the previous implementation
    incorrectly allowed any hex digit there.
    """
    hex_digits = '0123456789abcdef'
    return re.sub(
        r'[xy]',
        lambda m: random.choice('89ab') if m.group(0) == 'y' else random.choice(hex_digits),
        'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4632
4633
def make_dir(path, to_screen=None):
    """Create the parent directory of `path` (if any), returning success.

    @param path       file path whose containing directory should exist
    @param to_screen  optional callable used to report a failure message
    @returns          True on success (or nothing to create), False on failure
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Bug fix: the original condition was `callable(to_screen) is not None`,
        # which is always True (callable() returns a bool) and crashed with a
        # TypeError when to_screen was None
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4644
4645
def get_executable_path():
    """Return the directory containing the running executable/script."""
    from ..update import _get_variant_and_executable_path

    _, exe_path = _get_variant_and_executable_path()
    return os.path.dirname(os.path.abspath(exe_path))
4650
4651
def get_user_config_dirs(package_name):
    """Yield candidate per-user configuration directories for package_name."""
    # XDG config home (e.g. ~/.config/package_name)
    yield os.path.join(
        os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config'), package_name)

    # Windows roaming appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # Hidden directory in the home folder (~/.package_name)
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')
4664
4665
def get_system_config_dirs(package_name):
    """Yield candidate system-wide configuration directories (/etc/package_name)."""
    yield os.path.join('/etc', package_name)
4669
4670
def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
    """
    # kwargs are timedelta components (hours=, minutes=, ...) added as an offset
    offset = datetime.timedelta(**kwargs).total_seconds()
    return time.time() + offset
4676
4677
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JSON Web Signature (JWS Compact Serialization) signed with HS256.

    @param payload_data  JSON-serializable claims object
    @param key           shared HMAC secret (str)
    @param headers       optional extra JOSE header fields, merged over alg/typ
    @returns             token as bytes: b'<header>.<payload>.<signature>'

    NOTE: standard JWS uses unpadded base64url; this implementation uses plain
    base64 with padding — kept as-is for compatibility with existing callers.
    """
    # Fix: `headers={}` was a mutable default argument; use None as sentinel
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4695
4696
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode and return the payload of a JWT. Does NOT verify the signature."""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # Restore any stripped base64 padding; superfluous '=' are ignored
    return json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
4703
4704
4705 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4706
4707
@functools.cache
def supports_terminal_sequences(stream):
    """Whether ANSI/VT escape sequences can be used when writing to `stream`."""
    # On Windows, VT processing must have been enabled (see windows_enable_vt_mode);
    # elsewhere, a TERM environment variable must be present
    unsupported = (
        not WINDOWS_VT_MODE if compat_os_name == 'nt'
        else not os.getenv('TERM'))
    if unsupported:
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
4719
4720
def windows_enable_vt_mode():
    """Ref: https://bugs.python.org/issue30075 """
    # VT sequence support requires Windows 10 build 10586 or newer
    if get_windows_version() < (10, 0, 10586):
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly rather than relying on stdout
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Add VT processing to the existing console mode flags
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Invalidate previously cached results now that VT mode is enabled
    supports_terminal_sequences.cache_clear()
4751
4752
4753 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
4754
4755
def remove_terminal_sequences(string):
    """Strip ANSI SGR (color/style) escape sequences from `string`."""
    # Same pattern as the module-level _terminal_sequences_re; the `re` module
    # caches compiled patterns, so inlining it costs nothing
    return re.sub('\033\\[[^m]+m', '', string)
4758
4759
def number_of_digits(number):
    """Length of the '%d' decimal rendering of `number` (a '-' sign counts)."""
    decimal = '%d' % number
    return len(decimal)
4762
4763
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy values with `delim`; with `from_dict`, each value is first
    resolved as a traversal path into that dict."""
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(val) for val in values if val)
4768
4769
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails

    max_width = str(max_dimensions[0])
    scaled = []
    for thumbnail in thumbnails:
        scaled.append(merge_dicts(
            {'url': re.sub(url_width_re, max_width, thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
4790
4791
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    match = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if match is None:
        return None, None, None
    start, end, total = match.group(1), match.group(2), match.group(3)
    return int(start), int_or_none(end), int_or_none(total)
4800
4801
def read_stdin(what):
    """Return sys.stdin, printing an interactive hint about `what` is being read."""
    if what:
        eof_key = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
        write_string(f'Reading {what} from STDIN - EOF ({eof_key}) to end:\n')
    return sys.stdin
4807
4808
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """
    # A BOM takes precedence over any in-file coding declaration
    for bom, encoding in BOMS:
        if data.startswith(bom):
            return encoding, len(bom)

    # Drop null bytes so UTF-16/UTF-32 text matches regardless of endianness
    data = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
    if mobj:
        return mobj.group(1).decode(), 0
    return None, 0
4825
4826
class Config:
    """Hierarchy of command-line/config-file argument sources.

    Wraps an option parser plus a tree of loaded configuration sources
    (files, stdin), de-duplicating config locations and supporting
    recursively referenced configs via `config_locations`.
    """
    own_args = None      # raw args this config was initialized with
    parsed_args = None   # own args after a successful parse_known_args
    filename = None      # file the args were read from, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Initialize with the given args/filename and load referenced configs."""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own args and recursively append any configs they reference.

        @returns False if this config's file was already loaded, else True
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            # Guard against loading the same file twice (e.g. circular references)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against this config's directory
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read and shlex-split a config file; returns `default` if it cannot be opened."""
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # Bug fix: the message contained the literal placeholder "(unknown)"
            # instead of interpolating the actual filename
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of opts with values of sensitive options replaced by 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handle the '--option=value' form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handle the '--option value' form
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child Config sharing the loaded-path set; append it if it loads."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Yield all args; own args come last so they take precedence when parsed."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
4934
4935
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in headers.items():
            # Title-casing normalizes the key so later dicts overwrite earlier ones
            merged[name.title()] = value
    return merged
4939
4940
def cached_method(f):
    """Cache a method"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Normalize the call into a canonical key, excluding `self`
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        cache_key = tuple(bound.arguments.values())[1:]

        # Per-instance, per-method cache stored on the instance itself
        method_caches = vars(self).setdefault('_cached_method__cache', {})
        cache = method_caches.setdefault(f.__name__, {})
        if cache_key not in cache:
            cache[cache_key] = f(self, *args, **kwargs)
        return cache[cache_key]
    return wrapper
4956
4957
class classproperty:
    """property access for class methods with optional caching"""

    def __new__(cls, func=None, *args, **kwargs):
        # Called without a function, e.g. @classproperty(cache=True):
        # return a partial that will receive the function on the next call
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        # No caching requested: recompute on every access
        if self._cache is None:
            return self.func(cls)
        # Cache per accessed class, so subclasses get their own entries
        if cls not in self._cache:
            self._cache[cls] = self.func(cls)
        return self._cache[cls]
4976
4977
class function_with_repr:
    """Wrap a callable so repr() shows its qualified name, or a custom string."""

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func, self.__repr = func, repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    @classmethod
    def set_repr(cls, repr_):
        """Return a constructor with the custom repr preset."""
        return functools.partial(cls, repr_=repr_)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
4994
4995
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        # Iterating a Namespace yields its attribute values in insertion order
        yield from vars(self).values()

    @property
    def items_(self):
        return vars(self).items()
5005
5006
# Known media-related file extensions, grouped by kind
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# The generic video/audio groups also include all of the "common" extensions
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# Union of video, audio and manifest extensions
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5021
5022
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # attempt: 1-based number of the attempt currently running inside the loop
    # _error: error stored for the current attempt; NO_DEFAULT means "unset"
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        # Invoked as error_callback(error, attempt, retries) after each failed attempt
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # Keep looping while the previous attempt set an error (or none ran yet)
        # and the retry budget is not exhausted
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # Expose the NO_DEFAULT "unset" sentinel as None to callers
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            # Caller's loop body may have stored an error; report it
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            # Out of retries: report via `error` if provided, else re-raise
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # Prefer the underlying cause's message for readability
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5077
5078
def make_archive_id(ie, video_id):
    """Build a download-archive entry: '<lowercased extractor key> <video id>'."""
    key = ie if isinstance(ie, str) else ie.ie_key()
    return f'{key.lower()} {video_id}'
5082
5083
def truncate_string(s, left, right=0):
    """Shorten s to at most left+right characters, marking the elision with '...'."""
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    tail = s[-right:] if right else ''
    return f'{s[:left - 3]}...{tail}'
5089
5090
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Expand option names (supporting '-' prefixes for removal and aliases)
    into an ordered, de-duplicated list; 'all' must be a defined alias."""
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for val in options:
        discard = val.startswith('-')
        if discard:
            val = val[1:]

        if val in alias_dict:
            # Expand the alias; discarding an alias negates each of its members
            expansion = alias_dict[val]
            if discard:
                expansion = [item[1:] if item.startswith('-') else f'-{item}' for item in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(val, re.I).fullmatch, alias_dict['all'])
        else:
            current = [val] if val in alias_dict['all'] else None
        if current is None:
            raise ValueError(val)

        if discard:
            for item in current:
                while item in requested:
                    requested.remove(item)
        else:
            requested.extend(current)

    return orderedSet(requested)
5119
5120
# TODO: Rewrite
class FormatSorter:
    """Builds sort keys for format dicts from a field-preference specification.

    The sort specification is assembled from ``ydl.params['format_sort']``
    (user), the extractor-supplied ``field_preference``, and ``self.default``.
    ``calculate_preference(format)`` returns a tuple of per-field comparison
    tuples suitable for use as a ``sorted(..., key=...)`` key.
    """

    # Grammar of a single sort token, e.g. "+res:1080" or "br~2000":
    #   optional '+' reverses the order, ':' caps at a limit, '~' sorts by
    #   proximity to the limit.
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Sort order approximating youtube-dl's behaviour (used elsewhere;
    # presumably selected via a compat option -- not referenced in this class)
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration. Recognised keys (defaults are synthesised in
    # _get_field_setting for missing ones):
    #   'type':       'ordered' (ranked by 'order'), 'boolean', 'extractor',
    #                 'combined'/'multiple' (derived from several fields),
    #                 'alias', or the implicit default 'field'
    #   'field':      format-dict key(s) the value is read from, when it
    #                 differs from the setting's own name
    #   'convert':    normalisation applied to values/limits
    #                 ('float', 'float_none', 'bytes', 'string', 'order', ...)
    #   'regex':      for 'ordered' types, entries of 'order' are regexes
    #                 matched (re.match, i.e. anchored at the start) against
    #                 the value
    #   'order_free': alternative ranking used when prefer_free_formats is set
    #   'function':   for 'multiple' types, folds the component values into one
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       # 1 if the format has any audio or video stream, else 0
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        # Bitrate/size: first truthy value wins (tbr > vbr > abr, etc.)
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        # Resolution: smaller of height/width (ignoring missing values)
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        """Parse the sort specification from ydl.params and `field_preference`
        (the extractor-supplied sort list) into self._order/self.settings."""
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return the value of `key` for `field` from self.settings,
        synthesising (and caching) a type-dependent default when missing.
        Unknown fields are accepted with a deprecation warning and get an
        empty settings entry (all defaults)."""
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            # NB: defaults are written back into the (class-level) settings
            # dict, so later lookups hit the cached value
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Normalise a raw value (or limit string) for `field` into a
        comparable value per the field's 'convert' setting.

        With convertNone=False, None is returned unchanged; with
        convertNone=True, None is pushed through the conversion (used for
        'ordered' fields, where None may appear in the order list)."""
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            # Rank within the order list: earlier entries get higher scores;
            # values absent from the list rank like the '' entry (or below
            # everything when there is no '' entry)
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            # 'float_string': numeric strings compare as numbers; the first
            # non-numeric value demotes the whole field to string comparison
            if value.isnumeric():
                return float(value)
            else:
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Build self._order (the list of fields to sort by, first = most
        significant) and merge per-field reverse/closest/limit data into
        self.settings."""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # First occurrence of a field wins; duplicates are ignored
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # Precedence: forced defaults, then (unless format_sort_force)
        # priority defaults, then user, then extractor, then the defaults
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A 'combined' field fans out to its component fields; a
            # colon-separated limit is distributed across them
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Write the resolved sort order (user, extractor, final) via the
        supplied debug-logging callable."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn one field's raw value into a 2- or 3-tuple comparison key,
        honouring the field's reverse/closest/limit settings."""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Read the field's value(s) from `format` (folding 'multiple' fields
        through their 'function') and return its comparison key."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return the full sort key for `format` (tuple of per-field keys).

        NB: mutates `format` in place, filling in missing 'protocol', 'ext',
        'video_ext'/'audio_ext' and vbr/abr/tbr values first."""
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5416
5417
# XXX: Temporary
class _YDLLogger:
    """Adapter exposing a logger-like interface over a YoutubeDL instance.

    Every method silently does nothing when no instance is attached,
    so the logger can always be called unconditionally.
    """

    def __init__(self, ydl=None):
        self._ydl = ydl

    def debug(self, message):
        """Forward to YoutubeDL.write_debug."""
        if self._ydl is None:
            return
        self._ydl.write_debug(message)

    def info(self, message):
        """Forward to YoutubeDL.to_screen."""
        if self._ydl is None:
            return
        self._ydl.to_screen(message)

    def warning(self, message, *, once=False):
        """Forward to YoutubeDL.report_warning."""
        if self._ydl is None:
            return
        self._ydl.report_warning(message, once)

    def error(self, message, *, is_error=True):
        """Forward to YoutubeDL.report_error."""
        if self._ydl is None:
            return
        self._ydl.report_error(message, is_error=is_error)

    def stdout(self, message):
        """Forward to YoutubeDL.to_stdout."""
        if self._ydl is None:
            return
        self._ydl.to_stdout(message)

    def stderr(self, message):
        """Forward to YoutubeDL.to_stderr."""
        if self._ydl is None:
            return
        self._ydl.to_stderr(message)