# yt_dlp/utils/_utils.py
1 import base64
2 import binascii
3 import calendar
4 import codecs
5 import collections
6 import collections.abc
7 import contextlib
8 import datetime
9 import email.header
10 import email.utils
11 import errno
12 import hashlib
13 import hmac
14 import html.entities
15 import html.parser
16 import inspect
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import mimetypes
23 import netrc
24 import operator
25 import os
26 import platform
27 import random
28 import re
29 import shlex
30 import socket
31 import ssl
32 import struct
33 import subprocess
34 import sys
35 import tempfile
36 import time
37 import traceback
38 import types
39 import unicodedata
40 import urllib.error
41 import urllib.parse
42 import urllib.request
43 import xml.etree.ElementTree
44
45 from . import traversal
46
47 from ..compat import functools # isort: split
48 from ..compat import (
49 compat_etree_fromstring,
50 compat_expanduser,
51 compat_HTMLParseError,
52 compat_os_name,
53 compat_shlex_quote,
54 )
55 from ..dependencies import xattr
56
# Report this submodule's contents as belonging to the parent `yt_dlp.utils`
# package (affects repr(), pickling and error messages)
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
61
62
class NO_DEFAULT:
    """Sentinel used to distinguish "no default supplied" from `None`."""
    pass
65
66
def IDENTITY(x):
    """Identity function: return the argument unchanged."""
    return x
69
70
# English month names, the canonical list for date parsing
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code, for parsing localised date strings
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# Timezone abbreviation -> UTC offset in hours
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
96
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its (possibly multi-character) ASCII fallback
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
101
# strptime() patterns tried (in order) when parsing free-form date strings
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Additional patterns where the day precedes the month (European style)
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# Additional patterns where the month precedes the day (US style)
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of the common JS "P.A.C.K.E.R." obfuscator
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches <script type="application/ld+json"> blocks (JSON-LD metadata)
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# An integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
172
173
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable before trusting it
        'TEST'.encode(encoding)
    except Exception:
        encoding = 'UTF-8'
    return encoding
188
189
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # Write to a sibling temporary file first, so the final rename is atomic
    # on POSIX filesystems
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile creates files with restrictive permissions;
            # re-apply the process umask so the result matches a regular open()
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file; re-raise the original error
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
214
215
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    # Only plain attribute names are supported (no quoting/escaping done)
    assert re.match(r'^[a-zA-Z_-]+$', key)
    predicate = '[@%s]' % key if val is None else f"[@{key}='{val}']"
    return node.find(xpath + predicate)
221
# Historical note: xpath_with_ns exists because on very old Python versions
# the xml.etree.ElementTree.Element methods did not support the namespaces
# parameter
224
225
def xpath_with_ns(path, ns_map):
    """Expand `prefix:tag` components of an xpath using ns_map ({prefix: uri})."""
    expanded = []
    for component in path.split('/'):
        pieces = component.split(':')
        if len(pieces) == 1:
            expanded.append(pieces[0])
        else:
            prefix, tag = pieces
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
236
237
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a string, or an iterable of
    candidate xpaths tried in order). Falls back to `default` if given,
    raises when `fatal`, otherwise returns None."""
    def _find(xp):
        return node.find(xp)

    if isinstance(xpath, str):
        n = _find(xpath)
    else:
        for candidate in xpath:
            n = _find(candidate)
            if n is not None:
                break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element %s' % (xpath if name is None else name))
    return None
259
260
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but return the matched element's text content."""
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element
    if element.text is not None:
        return element.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
274
275
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Find xpath[@key] and return the value of attribute `key`."""
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (f'{xpath}[@{key}]' if name is None else name))
    return None
287
288
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document (None if not found)"""
    return get_element_by_attribute('id', id, html, **kwargs)
292
293
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document (None if not found)"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
297
298
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
303
304
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_html_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
309
310
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute=value, or None"""
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None
314
315
def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the html of the first tag with the specified attribute=value, or None"""
    matches = get_elements_html_by_attribute(attribute, value, html, **kargs)
    if not matches:
        return None
    return matches[0]
319
320
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # NB: **kargs is accepted (and ignored) for interface compatibility
    # Match class_name as a whole word inside the attribute's value list
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
326
327
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # Match class_name as a whole word inside the attribute's value list
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
333
334
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document, as a list"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
338
339
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document, as a list"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
343
344
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    Yields (content, whole_element) pairs for every matching tag.
    """
    if not value:
        return

    # If the value begins with a character that is illegal in an unquoted
    # attribute value, only quoted occurrences can match; otherwise the
    # quote character is made optional ('?')
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches the opening tag up to and including the sought attribute;
    # the full element is then extracted by get_element_text_and_html_by_tag
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of surrounding quotes, if any, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
370
371
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Control-flow exception: signals "found the matching closing tag"
        pass

    def __init__(self):
        # Stack of currently-open tags; emptied when the first tag closes
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop (possibly unclosed) inner tags until the matching opener is found
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The first opened tag has just been closed
            raise self.HTMLBreakOnClosingTagException()
412
413
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index that raises the given parse error instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed only the opening tag first, to confirm the parser agrees on it
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Advance one candidate closing tag at a time; the parser signals via
        # HTMLBreakOnClosingTagException when the nesting actually balances
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
448
449
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        super().__init__()

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        # Abort parsing: only the first start tag is of interest
        raise compat_HTMLParseError('done')
460
461
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        # Record only top-level <li> elements; nested lists are skipped
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
477
478
def extract_attributes(html_element):
    """Decode the attributes of an HTML start tag given as a string, e.g.
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    becomes
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    except compat_HTMLParseError:
        # The parser deliberately aborts after the first start tag
        pass
    return parser.attrs
498
499
def parse_list(webpage):
    """Given a string for an series of HTML <li> elements,
    return a dictionary of their attributes"""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
507
508
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Applied strictly in order: collapse whitespace, turn <br>/paragraph
    # boundaries into newlines, strip remaining tags
    substitutions = (
        (r'\s+', ' '),
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
523
524
class LenientJSONDecoder(json.JSONDecoder):
    """JSON decoder that can pre-transform the input, ignore trailing data
    and attempt to close truncated objects/arrays."""
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        # transform_source: optional callable applied to the string before decoding
        # ignore_extra: if True, decode the first JSON value and ignore the rest
        # close_objects: how many unterminated objects/arrays to try to close
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # Each close may take two repair passes (append ',' first, then the
        # closing bracket), hence 2 * close_objects attempts
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Try to repair the truncated document from the decode error;
        return the amended string, or None if not repairable here."""
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                # Re-raise with surrounding context in the message for debugging
                raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
563
564
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed or is unsupported: fall back to a plain open()
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Permission errors are never fixable by renaming; give up at once
            if attempt or err.errno in (errno.EACCES,):
                raise
            # First failure: retry once with a sanitized path
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
602
603
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
611
612
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    # NUL ('\0') is used below as a temporary marker for substitute
    # characters; it is collapsed/stripped at the end
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # '?', control characters and DEL are dropped entirely
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
666
667
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows.

    On other platforms the path is returned unchanged unless force=True.
    """
    # XXX: this handles drive relative paths (c:sth) incorrectly
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters that are illegal in Windows path components, and
    # trailing spaces/dots, with '#' ('.'/'..' components are kept as-is)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    # TODO: Fix behavioral differences <3.12
    # The workaround using `normpath` only superficially passes tests
    # Ref: https://github.com/python/cpython/pull/100351
    return os.path.normpath(os.path.join(*sanitized_path))
693
694
def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    if url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    common_typos = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in common_typos:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
713
714
def extract_basic_auth(url):
    """Strip userinfo from url; return (clean_url, 'Basic ...' header or None)."""
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild netloc without the credentials
    netloc = parts.hostname if parts.port is None else '%s:%d' % (parts.hostname, parts.port)
    url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = '%s:%s' % (parts.username, parts.password or '')
    auth_payload = base64.b64encode(credentials.encode())
    return url, f'Basic {auth_payload.decode()}'
725
726
def expand_path(s):
    """Expand shell variables and ~ (user home) in s"""
    return os.path.expandvars(compat_expanduser(s))
730
731
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _dedupe():
        # A list (not a set) is used for membership so unhashable items work
        encountered = []
        for item in iterable:
            if item not in encountered:
                encountered.append(item)
                yield item

    gen = _dedupe()
    return gen if lazy else list(gen)
742
743
744 def _htmlentity_transform(entity_with_semicolon):
745 """Transforms an HTML entity to a character."""
746 entity = entity_with_semicolon[:-1]
747
748 # Known non-numeric HTML entity
749 if entity in html.entities.name2codepoint:
750 return chr(html.entities.name2codepoint[entity])
751
752 # TODO: HTML5 allows entities without a semicolon.
753 # E.g. '&Eacuteric' should be decoded as 'Éric'.
754 if entity_with_semicolon in html.entities.html5:
755 return html.entities.html5[entity_with_semicolon]
756
757 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
758 if mobj is not None:
759 numstr = mobj.group(1)
760 if numstr.startswith('x'):
761 base = 16
762 numstr = '0%s' % numstr
763 else:
764 base = 10
765 # See https://github.com/ytdl-org/youtube-dl/issues/7518
766 with contextlib.suppress(ValueError):
767 return chr(int(numstr, base))
768
769 # Unknown entity in name, return its literal representation
770 return '&%s;' % entity
771
772
def unescapeHTML(s):
    """Replace HTML entities in s with their characters; None passes through."""
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda match: _htmlentity_transform(match.group(1)), s)
780
781
def escapeHTML(text):
    """Escape &, <, >, " and ' for safe embedding into HTML."""
    # '&' must be replaced first so already-inserted entities are not mangled
    replacements = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    for char, entity in replacements:
        text = text.replace(char, entity)
    return text
791
792
class netrc_from_content(netrc.netrc):
    """netrc parser that reads credentials from a string instead of a file."""

    def __init__(self, content):
        self.hosts, self.macros = {}, {}
        with io.StringIO(content) as stream:
            # '-' serves as the placeholder filename in parse error messages
            self._parse('-', stream, False)
798
799
class Popen(subprocess.Popen):
    """subprocess.Popen wrapper: hides the console window on Windows, restores
    library paths under PyInstaller, defaults text mode to UTF-8/replace, and
    adds kill/communicate/run conveniences."""
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        # Prevents a visible console window from popping up for subprocesses
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
           Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether streams are text, for the default values in run()
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            # Spawn cmd.exe explicitly with quoting-safe flags instead of
            # relying on shell=True
            if not isinstance(args, str):
                args = ' '.join(compat_shlex_quote(a) for a in args)
            shell = False
            args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        # Locate cmd.exe via %ComSpec%, falling back to %SystemRoot%\System32
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        """communicate(), but kill the process if communication fails/interrupts."""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            # Optionally reap the process so no zombie is left behind
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run to completion; return (stdout, stderr, returncode)."""
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
870
871
def encodeArgument(s):
    """Return s as str, decoding legacy ASCII byte strings."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
877
878
# Lightweight (hours, minutes, seconds, milliseconds) record used by the
# duration helpers below
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
880
881
def timetuple_from_msec(msec):
    """Split a millisecond count into an (hours, minutes, seconds, milliseconds) tuple."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
887
888
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H<delim>M]M<delim>SS, optionally with .mmm."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        formatted = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        formatted = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        formatted = '%d' % t.seconds
    if msec:
        formatted = '%s.%03d' % (formatted, t.milliseconds)
    return formatted
898
899
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" blurb, appended after `before`."""
    from ..update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        # Starting a new sentence: capitalise the first word
        msg = msg[0].title() + msg[1:]

    if not before:
        return msg
    return f'{before} {msg}'
911
912
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None  # subclasses may set a class-level default message

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            # No explicit message and no class default: fall back to class name
            self.msg = type(self).__name__
        super().__init__(self.msg)
923
924
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions
        # Network failures are never treated as yt-dlp bugs
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie  # used as the '[...] ' prefix of the full message
        self.exc_info = sys.exc_info()  # preserve original exception
        # Collapse chained ExtractorErrors down to the innermost original exc_info
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: "[ie] video_id: orig_msg (caused by ...)" plus the
        # bug-report blurb for unexpected errors
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted traceback (including the cause chain), or None."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Keep msg/args in sync when attributes feeding __msg change after init
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
967
968
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor supports."""

    def __init__(self, url):
        super().__init__('Unsupported URL: %s' % url, expected=True)
        self.url = url
974
975
class RegexNotFoundError(ExtractorError):
    """Error raised when a regular expression fails to match"""
    pass
979
980
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # countries: presumably a list of country codes where the content is
        # available — not validated here; TODO confirm against callers
        kwargs['expected'] = True  # geo-restriction is not a bug in yt-dlp
        super().__init__(msg, **kwargs)
        self.countries = countries
992
993
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True  # not being live is an expected condition
        super().__init__(msg or 'The channel is not currently live', **kwargs)
1000
1001
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info  # (type, value, traceback) of the underlying error
1014
1015
class EntryNotInPlaylist(YoutubeDLError):
    """Raised by YoutubeDL when a requested entry cannot be found
    in the playlist's info_dict"""
    msg = 'Entry not found in info'
1023
1024
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Include the offending filename in the message (previously a
            # hard-coded '(unknown)' placeholder that discarded it)
            self.msg += f': {filename}'
        super().__init__(self.msg)
1037
1038
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    Raised from a PostProcessor's .run() method to signal a failure
    in the postprocessing task.
    """
1045
1046
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted"""
    msg = 'The download was cancelled'
1050
1051
class ExistingVideoReached(DownloadCancelled):
    """Raised when --break-on-existing is triggered"""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1055
1056
class RejectedVideoReached(DownloadCancelled):
    """Raised when --break-match-filter is triggered"""
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1060
1061
class MaxDownloadsReached(DownloadCancelled):
    """Raised once the --max-downloads limit has been reached"""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1065
1066
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """expected: whether this re-extraction is an anticipated, non-bug event"""
        super().__init__(msg)
        self.expected = expected
1073
1074
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Throttling-triggered re-extraction is never marked as "expected"
        super().__init__(self.msg, expected=False)
1081
1082
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Raised when a video is requested in a format that is not
    available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1095
1096
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller
    than what the server announced, indicating the connection was
    probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1110
1111
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended attributes fails; self.reason holds a coarse cause."""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code, self.msg = code, msg

        # Derive a portable reason string from errno and/or message text
        no_space = self.code in (errno.ENOSPC, errno.EDQUOT)
        no_space = no_space or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg
        if no_space:
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1126
1127
class XAttrUnavailableError(YoutubeDLError):
    """Raised when xattr support is unavailable (presumably missing tooling/OS support — semantics defined by callers)"""
1130
1131
def is_path_like(f):
    """Whether *f* can be used as a filesystem path (str, bytes or os.PathLike)."""
    return isinstance(f, (str, bytes, os.PathLike))
1134
1135
def extract_timezone(date_str):
    """Split *date_str* into (utc-offset timedelta, remaining date string).

    Handles 'Z', numeric offsets like '+01:00'/'-0500', and the named
    timezones listed in TIMEZONE_NAMES; falls back to timedelta(0).
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # No numeric offset: try a trailing named timezone after a time-of-day
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        # `m and ...` keeps this None-safe when the second search also fails
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # Matched the bare 'Z' (UTC)
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1164
1165
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # strptime's %S cannot handle fractional seconds; drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
        return calendar.timegm(dt.timetuple())
1181
1182
def date_formats(day_first=True):
    """Return the supported date format strings, day-first or month-first."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1185
1186
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    # The UTC offset is irrelevant for a date-only result; just strip it
    _, date_str = extract_timezone(date_str)

    # Try every known format; a later successful match overwrites an earlier one
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822 style parsing (e.g. 'Thu  01 Jan 1970 00:00:00')
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)
1209
1210
def unified_timestamp(date_str, day_first=True):
    """Parse assorted date/time strings into a UNIX timestamp, or None."""
    if not isinstance(date_str, str):
        return None

    # Drop commas/pipes and (abbreviated) weekday names, collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # Remember a PM marker before it is stripped below; applied as +12h
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Last resort: RFC 2822 parsing via the email package
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1242
1243
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1255
1256
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle filename: replaces the extension with '<lang>.<format>'."""
    return replace_extension(filename, f'{sub_lang}.{sub_format}', expected_real_ext)
1259
1260
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # Recursively resolve the base date, then apply the signed offset
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # timedelta cannot express months/years; use calendar-aware helper
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # 'auto': round to the unit the offset was expressed in
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1301
1302
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1313
1314
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months, clamping the day
    to the last day of the target month when needed."""
    total_months = dt.month - 1 + months
    year = dt.year + total_months // 12
    month = total_months % 12 + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1322
1323
def datetime_round(dt, precision='day'):
    """Round a datetime to the nearest *precision* unit (UTC result)."""
    if precision == 'microsecond':
        return dt  # nothing to round

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    stamp = calendar.timegm(dt.timetuple())
    rounded = ((stamp + seconds_per_unit / 2) // seconds_per_unit) * seconds_per_unit
    return datetime.datetime.fromtimestamp(rounded, datetime.timezone.utc)
1340
1341
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
1350
1351
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = (date_from_str(start, strict=True) if start is not None
                      else datetime.datetime.min.date())
        self.end = (date_from_str(end, strict=True) if end is not None
                    else datetime.datetime.max.date())
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1385
1386
@functools.cache
def system_identifier():
    """One-line description of the Python runtime and platform (cached)."""
    impl = platform.python_implementation()
    if impl == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        impl += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        impl,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1405
1406
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1414
1415
def write_string(s, out=None, encoding=None):
    """Write *s* to *out* (default: sys.stderr), coping with byte streams
    and text streams with an underlying buffer."""
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: must encode ourselves
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream with a raw buffer: bypass its encoder
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1435
1436
1437 # TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report a deprecation: printed (deduplicated) in CLI mode, emitted as a
    DeprecationWarning otherwise. kwargs are passed through to the printer."""
    from .. import _IN_CLI
    if _IN_CLI:
        # CLI mode: each distinct message is shown at most once per run
        if msg in deprecation_warning._cache:
            return
        deprecation_warning._cache.add(msg)
        if printer:
            return printer(f'{msg}{bug_reports_message()}', **kwargs)
        return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
    else:
        import warnings
        # Offset stacklevel so the warning points at the deprecated call site
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)


# Messages already emitted in CLI mode
deprecation_warning._cache = set()
1453
1454
def bytes_to_intlist(bs):
    """Convert a bytes-like object (or str) into a list of integer values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # bytes/bytearray indexing already yields ints
        return list(bs)
    return [ord(c) for c in bs]
1462
1463
def intlist_to_bytes(xs):
    """Pack a sequence of ints (0-255) back into a bytes object."""
    if not xs:
        return b''
    return struct.pack('%dB' % len(xs), *xs)
1468
1469
class LockingUnsupportedError(OSError):
    """Raised when no file-locking primitive is available on this platform."""
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1475
1476
# Cross-platform file locking: defines _lock_file(f, exclusive, block) and
# _unlock_file(f) using LockFileEx/UnlockFileEx on Windows and
# fcntl.flock/lockf elsewhere; stubs raise LockingUnsupportedError.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Win32 OVERLAPPED struct, required by LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the largest representable byte range
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stash the pointer on the file object; _unlock_file needs the same one
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        # Requires the OVERLAPPED pointer stored by _lock_file
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # Non-blocking attempt failed; propagate as-is
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            # Try each unlock mechanism in turn, mirroring _lock_file's fallbacks
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
1563
1564
class locked_file:
    """File wrapper that holds a lock (via _lock_file) for its open lifetime.

    'w' modes deliberately open without O_TRUNC and truncate only AFTER the
    lock has been acquired, so a concurrent reader never sees a half state.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        # Read-only modes take a shared lock, everything else an exclusive one
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Deferred truncation (see class docstring)
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Allow use without the `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate everything else (read/write/seek/...) to the real file
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1628
1629
@functools.cache
def get_filesystem_encoding():
    """The filesystem encoding, defaulting to utf-8 when undetectable (cached)."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
1634
1635
def shell_quote(args):
    """Return a single shell-escaped command line built from *args*."""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(a.decode(encoding) if isinstance(a, bytes) else a)
        for a in args)
1645
1646
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge any data already smuggled into the URL
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    fragment = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{fragment}'
1655
1656
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(); returns (url, data) or (url, default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1664
1665
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format numbers with decimal suffixes like K, M, etc. (Ki/Mi when factor=1024)."""
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        # Binary prefixes: 'k' -> 'Ki', 'M' -> 'Mi', ...; no suffix stays empty
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    return fmt % (num / factor ** exponent, suffix)
1678
1679
def format_bytes(bytes):
    """Human-readable byte count using binary suffixes, or 'N/A'."""
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
1682
1683
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number> <unit>' using *unit_table*; returns a rounded int or None.

    Non-strict mode accepts ',' as a decimal separator and only anchors at
    the start of the string.
    """
    num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(map(re.escape, unit_table))
    matcher = re.fullmatch if strict else re.match
    m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None
    value = float(m.group('num').replace(',', '.'))
    return round(value * unit_table[m.group('unit')])
1695
1696
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    unit_table = {unit: 1024**exponent for exponent, unit in enumerate(['', *'KMGTPEZY'])}
    return lookup_unit_table(unit_table, s.upper(), strict=True)
1702
1703
def parse_filesize(s):
    """Parse a human file size like '5.5MiB' into a byte count (int), or None.

    Note the deliberate asymmetries in the table below (e.g. 'kB' -> 1024
    but 'KB' -> 1000): do not "fix" them without checking the extractors.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1773
1774
def parse_count(s):
    """Parse a human count like '1.2M views' into an int; None if unparseable."""
    if s is None:
        return None

    # Drop a non-numeric prefix such as 'Views '
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    result = lookup_unit_table({
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }, s)
    if result is not None:
        return result

    # Fall back to a leading bare number ('123 views')
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(mobj.group(1)) if mobj else None
1802
1803
def parse_resolution(s, *, lenient=False):
    """Extract {'width', 'height'} (or just 'height') from a resolution-like string."""
    if s is None:
        return {}

    # lenient mode drops the alnum boundary guards around WxH
    pattern = (r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
               else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    mobj = re.search(pattern, s)
    if mobj:
        return {'width': int(mobj.group('w')), 'height': int(mobj.group('h'))}

    # '1080p' / '480i' style
    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    # '4k' / '8K' style
    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}
1827
1828
def parse_bitrate(s):
    """Extract an integer kbps value from a string like '128 kbps', else None."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
1835
1836
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    if lang not in MONTH_NAMES:
        lang = 'en'
    try:
        return MONTH_NAMES[lang].index(name) + 1
    except ValueError:
        return None
1846
1847
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1856
1857
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Existing entities (&amp; &lt; ... and numeric char refs) are left alone
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
1864
1865
def setproctitle(title):
    """Best-effort: set the process name via glibc prctl(PR_SET_NAME);
    silently does nothing when ctypes/libc.so.6/prctl are unavailable."""
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # PR_SET_NAME = 15 Ref: /usr/include/linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1892
1893
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present; None-safe."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1896
1897
def remove_end(s, end):
    """Strip *end* from the end of *s* when present; None-safe."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
1900
1901
def remove_quotes(s):
    """Strip one matching pair of surrounding quotes (single or double)."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1909
1910
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1917
1918
def url_basename(url):
    """Last component of the URL's path ('' when the path is empty)."""
    path = urllib.parse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
1922
1923
def base_url(url):
    """The URL up to (and including) the last '/' before any query/fragment."""
    m = re.match(r'https?://[^?#]+/', url)
    return m.group()
1926
1927
def urljoin(base, path):
    """Join *base* and *path* like urllib.parse.urljoin, but with permissive
    bytes/None handling; returns None on unusable input."""
    if isinstance(path, bytes):
        path = path.decode()
    if not path or not isinstance(path, str):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path  # already absolute (or scheme-relative)
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
1941
1942
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """int(v) * invscale // scale, or *default* when conversion fails.
    With get_attr, the named attribute of *v* is converted instead."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        # The whole expression stays inside the try: odd scale/invscale
        # values must also fall back to the default
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
1950
1951
def str_or_none(v, default=None):
    """str(v), or *default* when v is None."""
    if v is None:
        return default
    return str(v)
1954
1955
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Strip thousands separators ('.'/',') and a leading '+'
        return int_or_none(re.sub(r'[,\.\+]', '', int_str))
    return None
1963
1964
def float_or_none(v, scale=1, invscale=1, default=None):
    """float(v) * invscale / scale, or *default* when v is None or unconvertible."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
1972
1973
def bool_or_none(v, default=None):
    """v when it is an actual bool, else *default* (truthy values do NOT count)."""
    if isinstance(v, bool):
        return v
    return default
1976
1977
def strip_or_none(v, default=None):
    """v.strip() for strings, else *default*."""
    if isinstance(v, str):
        return v.strip()
    return default
1980
1981
def url_or_none(url):
    """The stripped URL if it looks like a supported scheme (http(s), rtmp
    variants, rtsp, mms, ftp(s)) or is scheme-relative; otherwise None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
1987
1988
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a unix timestamp (int/float) or 'YYYYMMDD' string using
    *date_format*; returns *default* on any failure."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            dt_obj = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                      + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            dt_obj = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            dt_obj = None  # triggers AttributeError below -> default
        # Substitute %s (epoch seconds) manually, since Windows strftime lacks it
        date_format = re.sub(
            r'(?<!%)(%%)*%s', rf'\g<1>{int(dt_obj.timestamp())}', date_format)
        return dt_obj.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2006
2007
def parse_duration(s):
    """Parse a duration string ('1:23:45', '2h 3m', ISO-8601-like 'PT1H2M3S',
    '2.5 hours', ...) into seconds as a float, or None when unparseable."""
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # 1) Colon-separated clock format: [[dd:]hh:]mm:ss[.ms]
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2) Unit-suffixed format, also covering ISO 8601 durations; note that
        # years/months/weeks before 'T' are matched but intentionally ignored
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3) Fractional '2.5 hours' / '90 min' forms
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        # ':' is accepted as a fraction separator in format 1); normalize it
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2062
2063
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the file's real extension.

    prepend_extension('abc.ext', 'temp') -> 'abc.temp.ext'

    If `expected_real_ext` is given and the actual extension differs, `ext` is
    appended to the unchanged filename instead:
    prepend_extension('abc.unexpected_ext', 'temp', 'ext') -> 'abc.unexpected_ext.temp'
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return f'{name}.{ext}{real_ext}'
    # Fix: previously returned the literal '(unknown)' instead of the original
    # filename here, which destroyed the output path on extension mismatch
    return f'{filename}.{ext}'
2070
2071
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's extension with `ext`; if `expected_real_ext` is given
    and does not match the actual one, append `ext` to the whole filename instead."""
    name, real_ext = os.path.splitext(filename)
    base = name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename
    return f'{base}.{ext}'
2077
2078
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        Popen.run([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return exe
    except OSError:
        # Not found in PATH (or not executable)
        return False
2087
2088
def _get_exe_version_output(exe, args):
    """Run `exe` with `args`; return its combined output, None on a non-zero
    exit status, or False if the executable could not be started."""
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, returncode = Popen.run(
            [encodeArgument(exe), *args], text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else stdout
2101
2102
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's output using `version_re`
    (default: 'version <ver>'); return `unrecognized` if nothing matches."""
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    mobj = re.search(version_re, output)
    return mobj.group(1) if mobj else unrecognized
2112
2113
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        # The executable exists but exited with an error
        return unrecognized[-1]
    return output and detect_exe_version(output, version_re, unrecognized[0])
2124
2125
def frange(start=0, stop=None, step=1):
    """Float range"""
    if stop is None:
        start, stop = 0, start
    if not step:
        # Zero step yields nothing (mirrors sign == 0 short-circuit)
        return
    direction = 1 if step > 0 else -1
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2134
2135
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Subclass so callers can distinguish LazyList misses from other IndexErrors
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache may be shared with another view of the same data
        # (see __reversed__ and __copy__)
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Serve already-consumed items first, then keep caching as we go
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # ~x == -x - 1, i.e. the same position counted from the opposite end
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                # Map the requested slice onto the underlying (forward) cache
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Only consume as many items as the requested index/slice needs
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            # Checking the first logical element consumes at most one item
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # Both views share the iterable and cache; only the direction flag differs
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2223
2224
class PagedList:
    """Base class for sequences of entries fetched lazily, one page at a time."""

    class IndexError(IndexError):
        # Subclass so callers can distinguish paging misses from other IndexErrors
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return the entries of page `pagenum`, consulting the cache first."""
        cached = self._cache.get(pagenum)
        if cached is not None:
            return cached
        # Pages beyond the known page count are empty by definition
        results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # Random access would refetch pages, so it is only allowed with caching
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __bool__(self):
        return bool(self.getslice(0, 1))
2266
2267
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # Iterate pages starting from the one that contains `start`
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset just past the last wanted entry, or None for "rest of page"
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember where fetching failed so later lookups short-circuit
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2307
2308
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        # One past the last page needed, clamped to the known page count
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # Entries to drop from the first page so the slice starts at `start`
        skip_elems = start - start_page * self._pagesize
        # How many entries are still wanted in total (None = all remaining)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None  # only applies to the first page
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    # This page contains the final wanted entry
                    yield from page_results[:only_more]
                    break
            yield from page_results
2333
2334
class PlaylistEntries:
    """Resolves a playlist info_dict's entries, honoring the user's
    --playlist-items / --playliststart / --playlistend selection."""

    # Sentinel for a requested entry that is absent from the playlist
    MissingEntry = object()
    # Whether the full extent of the playlist is already known
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Re-seat the known entries at their original 1-based playlist
            # indices, leaving MissingEntry in the gaps
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches item specs like "3", "5:10", "-4:", "1:10:2", "::-1"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or slice for each comma-separated item spec in `string`."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (playlist_index, entry) pairs for all user-requested items."""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Total number of playlist entries, if determinable without extraction."""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function mapping a 0-based index to an entry, raising
        # self.IndexError past the end of the playlist
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Yields (1-based index, entry) pairs; `idx` uses 1-based positions too
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2469
2470
def uppercase_escape(s):
    """Expand \\UXXXXXXXX escape sequences in `s` into the corresponding characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
2477
2478
def lowercase_escape(s):
    """Expand \\uXXXX escape sequences in `s` into the corresponding characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
2485
2486
def parse_qs(url, **kwargs):
    """Parse the query string of `url` into a dict of value lists."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2489
2490
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file, skipping comments and blank lines.

    @param batch_fd  Iterable of lines (str or bytes); closed when done
    @returns         List of cleaned-up URLs
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit must be passed as keyword - positional use is
        # deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2508
2509
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return ASCII bytes suitable for a POST body."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2512
2513
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url str or parse url tuple
    @param query_update update query
    @returns str
    """
    if isinstance(url, str):
        # Fast path: nothing to change
        if not kwargs and not query_update:
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged = {
            **urllib.parse.parse_qs(url.query),
            **query_update
        }
        kwargs['query'] = urllib.parse.urlencode(merged, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2532
2533
def update_url_query(url, query):
    """Add or replace the given query parameters in `url`."""
    return update_url(url, query_update=query)
2536
2537
2538 def _multipart_encode_impl(data, boundary):
2539 content_type = 'multipart/form-data; boundary=%s' % boundary
2540
2541 out = b''
2542 for k, v in data.items():
2543 out += b'--' + boundary.encode('ascii') + b'\r\n'
2544 if isinstance(k, str):
2545 k = k.encode()
2546 if isinstance(v, str):
2547 v = v.encode()
2548 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2549 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2550 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2551 if boundary.encode('ascii') in content:
2552 raise ValueError('Boundary overlaps with data')
2553 out += content
2554
2555 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2556
2557 return out, content_type
2558
2559
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            if has_specified_boundary:
                raise
            # Random boundary collided with the payload: retry with a fresh one
            boundary = None
2588
2589
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether `x` is an instance of `allowed_types` without being one of the
    blocked types (by default: str, bytes and mappings)."""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    return not isinstance(x, blocked_types) and isinstance(x, allowed_types)
2594
2595
def variadic(x, allowed_types=NO_DEFAULT):
    """Return `x` unchanged if it is iterable-like, else wrap it in a 1-tuple."""
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
2601
2602
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn with `args`/`kwargs` and return the first
    result that neither raises a common error nor fails the `expected_type`
    check; returns None if every candidate fails."""
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
2612
2613
def try_get(src, getter, expected_type=None):
    """Apply one or more getter callables to `src`, returning the first result
    that succeeds (and matches `expected_type`, if given)."""
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2616
2617
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of `dct` containing only items satisfying `cndn(key, value)`
    (by default: those whose value is not None)."""
    return {key: value for key, value in dct.items() if cndn(key, value)}
2620
2621
def merge_dicts(*dicts):
    """Merge the given dicts with earlier values taking precedence.

    A key is assigned when it is not yet present and the new value is not None,
    or when a later string value replaces an existing empty string."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            take_new = (value is not None and key not in merged
                        or isinstance(value, str) and merged[key] == '')
            if take_new:
                merged[key] = value
    return merged
2630
2631
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` unchanged if it is already str, otherwise decode it
    using `encoding` and `errors`."""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
2634
2635
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """Parse an age limit from an int, an 'NN+' string, a US movie rating or a
    TV parental guideline; returns None when unrecognized."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    mobj = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if mobj:
        return TV_PARENTAL_GUIDELINES['TV-' + mobj.group(1)]
    return None
2671
2672
def strip_jsonp(code):
    """Strip a JSONP callback wrapper (e.g. 'cb({...});'), leaving the JSON payload."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
2681
2682
def js_to_json(code, vars={}, *, strict=False):
    """Convert JavaScript-ish source `code` into a JSON-parseable string.

    @param vars    dict of variable name -> replacement value to substitute
    @param strict  if True, raise ValueError on unknown tokens instead of
                   quoting them as strings
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Keep escapes that are also valid JSON; rewrite or drop the rest
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Evaluate a `${...}` interpolation inside a template literal
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Normalize one matched token (string, keyword, number, comment, ...)
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            # Template literals get `${...}` substitution before re-quoting
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Integers used as object keys must become JSON strings
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # Convert `new Map([[k, v], ...])` into a JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2762
2763
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the preference list; unknown values rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2772
2773
# Stages at which postprocessors can be scheduled to run
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output filename templates
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types; the string values appear to be default filename
# suffix stems for the corresponding files (None = no separate default)
# — NOTE(review): verify against the callers of OUTTMPL_TYPES
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Regex template for %-style format specifiers; '{0}' is the key pattern and
# '{1}' the conversion-type pattern, filled in via str.format by users
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])? # unused in python
        {1} # conversion type
    )
    '''


# Conversion type characters accepted in %-style formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2813
2814
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
2823
2824
def version_tuple(v):
    """Convert a dotted/dashed version string into a tuple of ints."""
    return tuple(int(piece) for piece in re.split(r'[-.]', v))


def is_outdated_version(version, limit, assume_new=True):
    """Whether `version` is older than `limit`; missing or unparsable versions
    are treated as new or old according to `assume_new`."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
2836
2837
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Deferred import; presumably avoids a circular dependency with ..update
    from ..update import is_non_updateable

    return not is_non_updateable()
2844
2845
def args_to_str(args):
    """Return a short shell-quoted string representation of a subprocess command."""
    quoted = (compat_shlex_quote(part) for part in args)
    return ' '.join(quoted)
2849
2850
def error_to_str(err):
    """Format an exception as 'TypeName: message'."""
    return '{}: {}'.format(type(err).__name__, err)
2853
2854
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (optionally with parameters) to a file extension.

    For non-string input, returns `default` if given, else None. For unknown
    types, falls back to `default` or a cleaned-up version of the subtype."""
    if not isinstance(mt, str):
        if default is not NO_DEFAULT:
            return default
        return None

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'video/ogg': 'ogv',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop parameters (e.g. '; charset=utf-8') and normalize case
    mimetype = mt.partition(';')[0].strip().lower()
    _, _, subtype = mimetype.rpartition('/')

    # Look up the full type, then the subtype, then the subtype without '+suffix'
    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    elif default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
2944
2945
def ext2mimetype(ext_or_url):
    """Guess the MIME type from a file extension or URL; None if unknown or empty."""
    if not ext_or_url:
        return None
    if '.' not in ext_or_url:
        # Bare extension: turn it into a dummy filename for guess_type
        ext_or_url = f'file.{ext_or_url}'
    mime, _ = mimetypes.guess_type(ext_or_url)
    return mime
2952
2953
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string into vcodec/acodec/scodec and HDR info."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Strip leading zeros in each numeric part (e.g. 'av01.0.05M.08')
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                # Only the first video codec is kept
                continue
            vcodec = full_codec
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Two unrecognized codecs: assume video + audio, in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
2994
2995
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Pick a container extension compatible with the given codec and
    extension lists, optionally restricted/ordered by `preferences`."""
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        # Multiple video or audio streams: mkv is the known safe choice
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Reduce e.g. 'avc1.4D0041' to its base identifier 'avc1' (zeros dropped)
    sanitize_codec = functools.partial(
        try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    # First try to find a container whose codec set covers both codecs
    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    # Fall back to matching by the input extensions themselves
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3035
3036
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Guess a file extension for a response: from the Content-Disposition
    filename, then the x-amz-meta-name header, then the Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    meta_ext = getheader('x-amz-meta-name')
    if meta_ext:
        ext = meta_ext.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(getheader('Content-Type'), default=default)
3055
3056
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI for the given bytes and MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3059
3060
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit set, or content available for everyone
        return False
    return age_limit < content_limit
3069
3070
# List of known byte-order-marks (BOM)
# NB: the 4-byte UTF-32 BOMs must be tested before their 2-byte UTF-16 prefixes
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip (possibly repeated) BOMs, adopting the encoding they imply
        while first_bytes.startswith(bom):
            encoding, first_bytes = bom_encoding, first_bytes[len(bom):]

    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3090
3091
def determine_protocol(info_dict):
    """Return the download protocol of an info dict, inferring it from the URL if unset."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for scheme in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(scheme):
            return scheme

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3112
3113
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param delim        String to build a horizontal separator row below the header
                        (falsy to disable)
    @param extra_gap    Extra spaces to add between columns
    @param hide_empty   Drop columns whose cells are all empty
    """
    def width(string):
        # Visible width: ignore terminal escape sequences and the alignment tab
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only the columns whose corresponding filter entry is truthy;
        # columns beyond len(filterArray) are kept (fillvalue=True)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # With hide_empty, columns that are empty in every data row get width 0
    # and are filtered out of both the header and the data
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a separator row after the header
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Right-align the part after \t by padding at the tab position
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3144
3145
3146 def _match_one(filter_part, dct, incomplete):
3147 # TODO: Generalize code with YoutubeDL._build_format_filter
3148 STRING_OPERATORS = {
3149 '*=': operator.contains,
3150 '^=': lambda attr, value: attr.startswith(value),
3151 '$=': lambda attr, value: attr.endswith(value),
3152 '~=': lambda attr, value: re.search(value, attr),
3153 }
3154 COMPARISON_OPERATORS = {
3155 **STRING_OPERATORS,
3156 '<=': operator.le, # "<=" must be defined above "<"
3157 '<': operator.lt,
3158 '>=': operator.ge,
3159 '>': operator.gt,
3160 '=': operator.eq,
3161 }
3162
3163 if isinstance(incomplete, bool):
3164 is_incomplete = lambda _: incomplete
3165 else:
3166 is_incomplete = lambda k: k in incomplete
3167
3168 operator_rex = re.compile(r'''(?x)
3169 (?P<key>[a-z_]+)
3170 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3171 (?:
3172 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3173 (?P<strval>.+?)
3174 )
3175 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3176 m = operator_rex.fullmatch(filter_part.strip())
3177 if m:
3178 m = m.groupdict()
3179 unnegated_op = COMPARISON_OPERATORS[m['op']]
3180 if m['negation']:
3181 op = lambda attr, value: not unnegated_op(attr, value)
3182 else:
3183 op = unnegated_op
3184 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3185 if m['quote']:
3186 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3187 actual_value = dct.get(m['key'])
3188 numeric_comparison = None
3189 if isinstance(actual_value, (int, float)):
3190 # If the original field is a string and matching comparisonvalue is
3191 # a number we should respect the origin of the original field
3192 # and process comparison value as a string (see
3193 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3194 try:
3195 numeric_comparison = int(comparison_value)
3196 except ValueError:
3197 numeric_comparison = parse_filesize(comparison_value)
3198 if numeric_comparison is None:
3199 numeric_comparison = parse_filesize(f'{comparison_value}B')
3200 if numeric_comparison is None:
3201 numeric_comparison = parse_duration(comparison_value)
3202 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3203 raise ValueError('Operator %s only supports string values!' % m['op'])
3204 if actual_value is None:
3205 return is_incomplete(m['key']) or m['none_inclusive']
3206 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3207
3208 UNARY_OPERATORS = {
3209 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3210 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3211 }
3212 operator_rex = re.compile(r'''(?x)
3213 (?P<op>%s)\s*(?P<key>[a-z_]+)
3214 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3215 m = operator_rex.fullmatch(filter_part.strip())
3216 if m:
3217 op = UNARY_OPERATORS[m.group('op')]
3218 actual_value = dct.get(m.group('key'))
3219 if is_incomplete(m.group('key')) and actual_value is None:
3220 return True
3221 return op(actual_value)
3222
3223 raise ValueError('Invalid filter part %r' % filter_part)
3224
3225
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
    Can be True/False to indicate all/none of the keys may be missing.
    All conditions on incomplete keys pass if the key is missing
    """
    # Conditions are separated by unescaped '&'; every one must hold
    conditions = re.split(r'(?<!\\)&', filter_str)
    return all(
        _match_one(condition.replace(r'\&', '&'), dct, incomplete)
        for condition in conditions)
3236
3237
def match_filter_func(filters, breaking_filters=None):
    """Build a --match-filters style callable, or None if no filters are given.

    The returned function yields None to accept an entry, a string message to
    reject it, NO_DEFAULT to ask interactively, and raises RejectedVideoReached
    when a breaking filter matches.
    """
    if not filters and not breaking_filters:
        return None
    # Breaking filters reuse the same machinery; a non-None result aborts
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    # A lone '-' requests interactive confirmation instead of auto-accept
    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        ret = breaking_filters(info_dict, incomplete)
        if ret is not None:
            raise RejectedVideoReached(ret)

        passes = not filters or any(match_str(f, info_dict, incomplete) for f in filters)
        if passes:
            return NO_DEFAULT if interactive and not incomplete else None
        video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
        filter_str = ') | ('.join(map(str.strip, filters))
        return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3260
3261
class download_range_func:
    """Callable yielding the sections (chapters and/or time ranges) of a video to download."""

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters = chapters
        self.ranges = ranges
        self.from_info = from_info

    def __call__(self, info_dict, ydl):

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        # Yield every chapter whose title matches any of the given regexes
        for regex in self.chapters or []:
            for idx, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': idx}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # No restriction requested: a single empty section means "everything"
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # Negative timestamps count backwards from the end of the video
        if info.get('duration') and time < 0:
            return max(info['duration'] + time, 0)
        return time

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3302
3303
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float), or None if unparsable."""
    if not time_expr:
        return None

    # Plain offset, optionally suffixed with 's' (e.g. '12.3s')
    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    # Clock format HH:MM:SS[.fff] (some files use ':' instead of '.' for frames)
    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3315
3316
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours, minutes, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, msec)
3319
3320
def ass_subtitles_timecode(seconds):
    """Format a duration in seconds as an ASS timecode (H:MM:SS.cc, centisecond precision)."""
    hours, minutes, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (hours, minutes, secs, msec / 10)
3324
3325
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitle data to SRT.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> subtitle elements
    '''
    # Legacy TTAF namespaces that get rewritten (bytewise, before parsing)
    # to their modern TTML equivalents
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # TTML styling attributes that are translated into SRT-style markup below
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    # Helper to qualify tag/attribute names with the TTML namespaces
    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> {styling property: value}
    default_style = {}  # style inherited by all paragraphs (taken from <body>/<div>)

    class TTMLPElementParser:
        # XMLParser target that renders a single <p> element as SRT markup.
        # NOTE(review): these class-level mutable lists are shared between
        # instances; pushes in start() and pops in end() balance out over a
        # well-formed element, but per-instance initialization would be safer
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Effective style: default, then referenced style, then inline attrs
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip styles already applied by an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened by the matching start() in reverse order
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Re-serialize the element and feed it through TTMLPElementParser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve <style> elements; repeat while some parent styles are still
    # unresolved (styles may reference other styles in any document order)
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style attached to <body> or <div> becomes the default for every paragraph
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Fall back to begin + dur when no explicit end is given
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3492
3493
def cli_option(params, command_option, param, separator=None):
    """Build a CLI argument list for an option that takes a value.

    Returns [] when params[param] is unset; otherwise either
    [command_option, value] or ['<command_option><separator><value>'].
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3499
3500
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a CLI argument list for a boolean option, mapping True/False to the given strings."""
    flag = params.get(param)
    assert flag in (True, False, None)
    # Reuse cli_option by looking the boolean up in a {True: ..., False: ...} map;
    # None falls through cli_option's unset handling and yields []
    return cli_option({True: true_value, False: false_value}, command_option, flag, separator)
3505
3506
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3509
3510
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick the configuration args for the first matching key group.

    @param argdict    dict of {key: [args]}, a plain list/tuple (legacy format),
                      or None
    @param keys       list/tuple of key groups; each group is a single key or a
                      tuple of alias keys, and all aliases of the first group
                      with any match contribute their args (concatenated)
    @param use_compat whether a legacy list/tuple argdict is returned as-is
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if not use_compat:
            argdict = None
        else:
            return argdict
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        matched = [argdict.get(key.lower()) for key in variadic(key_list)]
        matched = [args for args in matched if args is not None]
        if matched:
            return list(itertools.chain.from_iterable(matched))
    return default
3529
3530
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve the configuration args for an executable under a main key.

    Builds the lookup keys ('<exe>' or '<main_key>+<exe>', plus any subkeys)
    and delegates to cli_configuration_args.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{key}' for key in (keys or [''])]
    if root_key not in keys:
        # Only specific subkeys were requested - no generic fallbacks,
        # and the legacy list format does not apply
        use_compat = False
    else:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
3542
3543
class ISO639Utils:
    """Conversions between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters matter, so e.g. 'en-US' also resolves
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        return next(
            (short_code for short_code, long_code in cls._lang_map.items() if long_code == code),
            None)
3749
class ISO3166Utils:
    """Lookup of full English country names from ISO 3166-1 alpha-2 codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full country name for a two-letter (ISO 3166-1 alpha-2) code, or None."""
        return cls._country_map.get(code.upper())
4012
4013 class GeoUtils:
4014 # Major IPv4 address blocks per country
4015 _country_ip_map = {
4016 'AD': '46.172.224.0/19',
4017 'AE': '94.200.0.0/13',
4018 'AF': '149.54.0.0/17',
4019 'AG': '209.59.64.0/18',
4020 'AI': '204.14.248.0/21',
4021 'AL': '46.99.0.0/16',
4022 'AM': '46.70.0.0/15',
4023 'AO': '105.168.0.0/13',
4024 'AP': '182.50.184.0/21',
4025 'AQ': '23.154.160.0/24',
4026 'AR': '181.0.0.0/12',
4027 'AS': '202.70.112.0/20',
4028 'AT': '77.116.0.0/14',
4029 'AU': '1.128.0.0/11',
4030 'AW': '181.41.0.0/18',
4031 'AX': '185.217.4.0/22',
4032 'AZ': '5.197.0.0/16',
4033 'BA': '31.176.128.0/17',
4034 'BB': '65.48.128.0/17',
4035 'BD': '114.130.0.0/16',
4036 'BE': '57.0.0.0/8',
4037 'BF': '102.178.0.0/15',
4038 'BG': '95.42.0.0/15',
4039 'BH': '37.131.0.0/17',
4040 'BI': '154.117.192.0/18',
4041 'BJ': '137.255.0.0/16',
4042 'BL': '185.212.72.0/23',
4043 'BM': '196.12.64.0/18',
4044 'BN': '156.31.0.0/16',
4045 'BO': '161.56.0.0/16',
4046 'BQ': '161.0.80.0/20',
4047 'BR': '191.128.0.0/12',
4048 'BS': '24.51.64.0/18',
4049 'BT': '119.2.96.0/19',
4050 'BW': '168.167.0.0/16',
4051 'BY': '178.120.0.0/13',
4052 'BZ': '179.42.192.0/18',
4053 'CA': '99.224.0.0/11',
4054 'CD': '41.243.0.0/16',
4055 'CF': '197.242.176.0/21',
4056 'CG': '160.113.0.0/16',
4057 'CH': '85.0.0.0/13',
4058 'CI': '102.136.0.0/14',
4059 'CK': '202.65.32.0/19',
4060 'CL': '152.172.0.0/14',
4061 'CM': '102.244.0.0/14',
4062 'CN': '36.128.0.0/10',
4063 'CO': '181.240.0.0/12',
4064 'CR': '201.192.0.0/12',
4065 'CU': '152.206.0.0/15',
4066 'CV': '165.90.96.0/19',
4067 'CW': '190.88.128.0/17',
4068 'CY': '31.153.0.0/16',
4069 'CZ': '88.100.0.0/14',
4070 'DE': '53.0.0.0/8',
4071 'DJ': '197.241.0.0/17',
4072 'DK': '87.48.0.0/12',
4073 'DM': '192.243.48.0/20',
4074 'DO': '152.166.0.0/15',
4075 'DZ': '41.96.0.0/12',
4076 'EC': '186.68.0.0/15',
4077 'EE': '90.190.0.0/15',
4078 'EG': '156.160.0.0/11',
4079 'ER': '196.200.96.0/20',
4080 'ES': '88.0.0.0/11',
4081 'ET': '196.188.0.0/14',
4082 'EU': '2.16.0.0/13',
4083 'FI': '91.152.0.0/13',
4084 'FJ': '144.120.0.0/16',
4085 'FK': '80.73.208.0/21',
4086 'FM': '119.252.112.0/20',
4087 'FO': '88.85.32.0/19',
4088 'FR': '90.0.0.0/9',
4089 'GA': '41.158.0.0/15',
4090 'GB': '25.0.0.0/8',
4091 'GD': '74.122.88.0/21',
4092 'GE': '31.146.0.0/16',
4093 'GF': '161.22.64.0/18',
4094 'GG': '62.68.160.0/19',
4095 'GH': '154.160.0.0/12',
4096 'GI': '95.164.0.0/16',
4097 'GL': '88.83.0.0/19',
4098 'GM': '160.182.0.0/15',
4099 'GN': '197.149.192.0/18',
4100 'GP': '104.250.0.0/19',
4101 'GQ': '105.235.224.0/20',
4102 'GR': '94.64.0.0/13',
4103 'GT': '168.234.0.0/16',
4104 'GU': '168.123.0.0/16',
4105 'GW': '197.214.80.0/20',
4106 'GY': '181.41.64.0/18',
4107 'HK': '113.252.0.0/14',
4108 'HN': '181.210.0.0/16',
4109 'HR': '93.136.0.0/13',
4110 'HT': '148.102.128.0/17',
4111 'HU': '84.0.0.0/14',
4112 'ID': '39.192.0.0/10',
4113 'IE': '87.32.0.0/12',
4114 'IL': '79.176.0.0/13',
4115 'IM': '5.62.80.0/20',
4116 'IN': '117.192.0.0/10',
4117 'IO': '203.83.48.0/21',
4118 'IQ': '37.236.0.0/14',
4119 'IR': '2.176.0.0/12',
4120 'IS': '82.221.0.0/16',
4121 'IT': '79.0.0.0/10',
4122 'JE': '87.244.64.0/18',
4123 'JM': '72.27.0.0/17',
4124 'JO': '176.29.0.0/16',
4125 'JP': '133.0.0.0/8',
4126 'KE': '105.48.0.0/12',
4127 'KG': '158.181.128.0/17',
4128 'KH': '36.37.128.0/17',
4129 'KI': '103.25.140.0/22',
4130 'KM': '197.255.224.0/20',
4131 'KN': '198.167.192.0/19',
4132 'KP': '175.45.176.0/22',
4133 'KR': '175.192.0.0/10',
4134 'KW': '37.36.0.0/14',
4135 'KY': '64.96.0.0/15',
4136 'KZ': '2.72.0.0/13',
4137 'LA': '115.84.64.0/18',
4138 'LB': '178.135.0.0/16',
4139 'LC': '24.92.144.0/20',
4140 'LI': '82.117.0.0/19',
4141 'LK': '112.134.0.0/15',
4142 'LR': '102.183.0.0/16',
4143 'LS': '129.232.0.0/17',
4144 'LT': '78.56.0.0/13',
4145 'LU': '188.42.0.0/16',
4146 'LV': '46.109.0.0/16',
4147 'LY': '41.252.0.0/14',
4148 'MA': '105.128.0.0/11',
4149 'MC': '88.209.64.0/18',
4150 'MD': '37.246.0.0/16',
4151 'ME': '178.175.0.0/17',
4152 'MF': '74.112.232.0/21',
4153 'MG': '154.126.0.0/17',
4154 'MH': '117.103.88.0/21',
4155 'MK': '77.28.0.0/15',
4156 'ML': '154.118.128.0/18',
4157 'MM': '37.111.0.0/17',
4158 'MN': '49.0.128.0/17',
4159 'MO': '60.246.0.0/16',
4160 'MP': '202.88.64.0/20',
4161 'MQ': '109.203.224.0/19',
4162 'MR': '41.188.64.0/18',
4163 'MS': '208.90.112.0/22',
4164 'MT': '46.11.0.0/16',
4165 'MU': '105.16.0.0/12',
4166 'MV': '27.114.128.0/18',
4167 'MW': '102.70.0.0/15',
4168 'MX': '187.192.0.0/11',
4169 'MY': '175.136.0.0/13',
4170 'MZ': '197.218.0.0/15',
4171 'NA': '41.182.0.0/16',
4172 'NC': '101.101.0.0/18',
4173 'NE': '197.214.0.0/18',
4174 'NF': '203.17.240.0/22',
4175 'NG': '105.112.0.0/12',
4176 'NI': '186.76.0.0/15',
4177 'NL': '145.96.0.0/11',
4178 'NO': '84.208.0.0/13',
4179 'NP': '36.252.0.0/15',
4180 'NR': '203.98.224.0/19',
4181 'NU': '49.156.48.0/22',
4182 'NZ': '49.224.0.0/14',
4183 'OM': '5.36.0.0/15',
4184 'PA': '186.72.0.0/15',
4185 'PE': '186.160.0.0/14',
4186 'PF': '123.50.64.0/18',
4187 'PG': '124.240.192.0/19',
4188 'PH': '49.144.0.0/13',
4189 'PK': '39.32.0.0/11',
4190 'PL': '83.0.0.0/11',
4191 'PM': '70.36.0.0/20',
4192 'PR': '66.50.0.0/16',
4193 'PS': '188.161.0.0/16',
4194 'PT': '85.240.0.0/13',
4195 'PW': '202.124.224.0/20',
4196 'PY': '181.120.0.0/14',
4197 'QA': '37.210.0.0/15',
4198 'RE': '102.35.0.0/16',
4199 'RO': '79.112.0.0/13',
4200 'RS': '93.86.0.0/15',
4201 'RU': '5.136.0.0/13',
4202 'RW': '41.186.0.0/16',
4203 'SA': '188.48.0.0/13',
4204 'SB': '202.1.160.0/19',
4205 'SC': '154.192.0.0/11',
4206 'SD': '102.120.0.0/13',
4207 'SE': '78.64.0.0/12',
4208 'SG': '8.128.0.0/10',
4209 'SI': '188.196.0.0/14',
4210 'SK': '78.98.0.0/15',
4211 'SL': '102.143.0.0/17',
4212 'SM': '89.186.32.0/19',
4213 'SN': '41.82.0.0/15',
4214 'SO': '154.115.192.0/18',
4215 'SR': '186.179.128.0/17',
4216 'SS': '105.235.208.0/21',
4217 'ST': '197.159.160.0/19',
4218 'SV': '168.243.0.0/16',
4219 'SX': '190.102.0.0/20',
4220 'SY': '5.0.0.0/16',
4221 'SZ': '41.84.224.0/19',
4222 'TC': '65.255.48.0/20',
4223 'TD': '154.68.128.0/19',
4224 'TG': '196.168.0.0/14',
4225 'TH': '171.96.0.0/13',
4226 'TJ': '85.9.128.0/18',
4227 'TK': '27.96.24.0/21',
4228 'TL': '180.189.160.0/20',
4229 'TM': '95.85.96.0/19',
4230 'TN': '197.0.0.0/11',
4231 'TO': '175.176.144.0/21',
4232 'TR': '78.160.0.0/11',
4233 'TT': '186.44.0.0/15',
4234 'TV': '202.2.96.0/19',
4235 'TW': '120.96.0.0/11',
4236 'TZ': '156.156.0.0/14',
4237 'UA': '37.52.0.0/14',
4238 'UG': '102.80.0.0/13',
4239 'US': '6.0.0.0/8',
4240 'UY': '167.56.0.0/13',
4241 'UZ': '84.54.64.0/18',
4242 'VA': '212.77.0.0/19',
4243 'VC': '207.191.240.0/21',
4244 'VE': '186.88.0.0/13',
4245 'VG': '66.81.192.0/20',
4246 'VI': '146.226.0.0/16',
4247 'VN': '14.160.0.0/11',
4248 'VU': '202.80.32.0/20',
4249 'WF': '117.20.32.0/21',
4250 'WS': '202.4.32.0/19',
4251 'YE': '134.35.0.0/16',
4252 'YT': '41.242.116.0/22',
4253 'ZA': '41.0.0.0/11',
4254 'ZM': '102.144.0.0/13',
4255 'ZW': '102.177.192.0/18',
4256 }
4257
4258 @classmethod
4259 def random_ipv4(cls, code_or_block):
4260 if len(code_or_block) == 2:
4261 block = cls._country_ip_map.get(code_or_block.upper())
4262 if not block:
4263 return None
4264 else:
4265 block = code_or_block
4266 addr, preflen = block.split('/')
4267 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4268 addr_max = addr_min | (0xffffffff >> int(preflen))
4269 return str(socket.inet_ntoa(
4270 struct.pack('!L', random.randint(addr_min, addr_max))))
4271
4272
4273 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4274 # released into Public Domain
4275 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4276
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # int.to_bytes with the minimal length already has no leading zeros,
        # replacing the original 32-bit chunking + manual zero stripping
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        # non-positive input collapses to a single NUL byte,
        # matching the original PyCrypto-derived behaviour
        s = b'\000'
    # pad the front so the total length is a multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4305
4306
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes handles any length (including empty -> 0) without the
    # manual front-padding and 32-bit chunking of the PyCrypto original
    return int.from_bytes(s, 'big')
4322
4323
4324 def ohdave_rsa_encrypt(data, exponent, modulus):
4325 '''
4326 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4327
4328 Input:
4329 data: data to encrypt, bytes-like object
4330 exponent, modulus: parameter e and N of RSA algorithm, both integer
4331 Output: hex string of encrypted data
4332
4333 Limitation: supports one block encryption only
4334 '''
4335
4336 payload = int(binascii.hexlify(data[::-1]), 16)
4337 encrypted = pow(payload, exponent, modulus)
4338 return '%x' % encrypted
4339
4340
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # PKCS#1 v1.5 (RFC 8017, EME-PKCS1-v1_5) requires the padding string to be
    # NONZERO octets: a zero byte is the padding terminator and would truncate
    # the payload on decryption. The previous randint(0, 254) could emit zeros.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4354
4355
4356 def _base_n_table(n, table):
4357 if not table and not n:
4358 raise ValueError('Either table or n must be specified')
4359 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4360
4361 if n and n != len(table):
4362 raise ValueError(f'base {n} exceeds table length {len(table)}')
4363 return table
4364
4365
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4377
4378
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    digit_values = {char: index for index, char in enumerate(_base_n_table(n, table))}
    base = len(digit_values)
    result = 0
    for char in string:
        result = result * base + digit_values[char]
    return result
4386
4387
def decode_packed_codes(code):
    """Expand obfuscated code that was packed with a base-n symbol table."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # map each base-n encoded index to its symbol, falling back to the
    # encoded index itself when the symbol slot is empty
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        encoded = encode_base_n(index, base)
        symbol_table[encoded] = symbols[index] or encoded

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
4404
4405
def caesar(s, alphabet, shift):
    """Rotate every character of *s* found in *alphabet* by *shift* positions
    (wrapping around); characters outside the alphabet pass through unchanged."""
    if not shift:
        return s
    size = len(alphabet)

    def rotate(ch):
        if ch not in alphabet:
            return ch
        return alphabet[(alphabet.index(ch) + shift) % size]

    return ''.join(map(rotate, s))
4413
4414
def rot47(s):
    """Apply the ROT47 substitution over the printable ASCII range."""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4417
4418
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (KEY=value pairs, values optionally quoted)
    into a dict of strings."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # strip the surrounding quotes from quoted values
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
4426
4427
def urshift(val, n):
    """Right-shift *val* by *n* bits, treating negative values as their
    unsigned 32-bit representation."""
    if val >= 0:
        return val >> n
    return (val + 0x100000000) >> n
4430
4431
def write_xattr(path, key, value):
    """Write the extended attribute *key* = *value* (bytes) on the file *path*.

    Tries, in order: NTFS ADS (Windows), os.setxattr / the `xattr`/`pyxattr`
    modules, then the `setfattr`/`xattr` command-line tools.
    Raises XAttrMetadataError on write failure, XAttrUnavailableError when no
    method is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            # 'path:key' opens the named alternate data stream of the file
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    setxattr = None
    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # the external tools take the value as a text argument
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4483
4484
def random_birthday(year_field, month_field, day_field):
    """Pick a uniformly random date between 1950-01-01 and 1995-12-31 and
    return it as a dict of stringified form-field values."""
    first = datetime.date(1950, 1, 1)
    last = datetime.date(1995, 12, 31)
    span_days = (last - first).days
    chosen = first + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
4495
4496
def find_available_port(interface=''):
    """Ask the OS for a currently-free TCP port on *interface*;
    returns None if a socket cannot be created or bound."""
    try:
        sock = socket.socket()
    except OSError:
        return None
    try:
        sock.bind((interface, 0))  # port 0 = let the kernel choose
        return sock.getsockname()[1]
    except OSError:
        return None
    finally:
        sock.close()
4504
4505
# Templates for internet shortcut files, which are plain text files.
# Windows-style .url shortcut (INI format)
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS .webloc shortcut (XML property list)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org .desktop link entry (Linux desktop environments)
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# maps the --write-link option names to the template used for each
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4537
4538
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    # NOTE(review): this drops port 80 regardless of scheme (e.g. also for
    # https), which changes the URL's meaning for non-http schemes — confirm intended.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4581
4582
def to_high_limit_path(path):
    """On Windows, convert *path* to an extended-length path ('\\\\?\\' prefix)
    to work around the MAX_PATH limitation; elsewhere return it unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
4589
4590
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Traverse *field* out of *obj*, apply *func* and *template* to it, or
    return *default* when the value is falsy/ignored."""
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val
    else:
        skip = val in variadic(ignore)
    return default if skip else template % func(val)
4596
4597
def clean_podcast_url(url):
    """Strip known podcast tracking/analytics redirect prefixes from *url*,
    then collapse a doubled scheme (e.g. 'https://https://...') left behind."""
    url = re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/''', '', url)
    # after removing a tracker host, the real scheme may follow the outer one
    return re.sub(r'^\w+://(\w+://)', r'\1', url)
4618
4619
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random UUIDv4-shaped string: version nibble fixed to '4',
    every other x/y position replaced with a random hex digit."""
    return ''.join(
        _HEX_TABLE[random.randint(0, 15)] if ch in 'xy' else ch
        for ch in 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4625
4626
def make_dir(path, to_screen=None):
    """Create the parent directory of *path* (like `mkdir -p`).

    @param to_screen optional callable used to report a failure message
    @returns True on success (or when *path* has no directory part), False on failure
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Previously `callable(to_screen) is not None`, which is always True
        # (callable() returns a bool), so a failure with the default
        # to_screen=None raised TypeError instead of returning False
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4637
4638
def get_executable_path():
    """Return the directory containing the running executable/script."""
    from ..update import _get_variant_and_executable_path

    exe_path = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(exe_path))
4643
4644
def get_user_config_dirs(package_name):
    """Yield the candidate per-user configuration directories for *package_name*."""
    # XDG config directory (e.g. ~/.config/package_name)
    yield os.path.join(
        os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config'),
        package_name)

    # Windows per-user appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # plain dotfile directory in the home directory (~/.package_name)
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')
4657
4658
def get_system_config_dirs(package_name):
    """Yield the candidate system-wide configuration directories for *package_name*."""
    # currently only /etc/package_name
    yield os.path.join('/etc', package_name)
4662
4663
def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)

    The keyword arguments (timedelta kwargs, e.g. hours=9) specify the
    timezone offset to add.
    """
    offset = datetime.timedelta(**kwargs).total_seconds()
    return time.time() + offset
4669
4670
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create an HS256-signed JWT in JWS Compact Serialization, as bytes.

    @param payload_data JSON-serializable claims object
    @param key          shared secret (str)
    @param headers      extra JOSE header fields, merged over the defaults
                        (was a mutable default `{}`; now None for safety)
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    # NB: uses standard (not URL-safe) base64 and keeps '=' padding,
    # matching the original implementation
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4688
4689
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded payload of a JWT *without* verifying its signature."""
    _, payload_b64, _ = jwt.split('.')
    # re-add any stripped base64 padding; superfluous '=' are ignored
    return json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
4696
4697
# None = not applicable (non-Windows); on Windows it starts False and is
# flipped to True by windows_enable_vt_mode() once VT processing is enabled
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4699
4700
@functools.cache
def supports_terminal_sequences(stream):
    """Whether ANSI terminal sequences are usable when writing to *stream*."""
    on_windows = compat_os_name == 'nt'
    # Windows needs VT processing enabled; elsewhere a TERM variable must be set
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
4712
4713
def windows_enable_vt_mode():
    """Enable virtual-terminal (ANSI escape) processing on the Windows console.

    On success, sets the module-level WINDOWS_VT_MODE flag and invalidates the
    supports_terminal_sequences() cache.
    Ref: https://bugs.python.org/issue30075 """
    # VT processing is only available from Windows 10 TH2 (build 10586)
    if get_windows_version() < (10, 0, 10586):
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # open the console output device directly rather than relying on stdout,
    # which may be redirected
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # OR the VT flag into the existing console mode
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # cached results were computed with VT mode off; recompute on next call
    supports_terminal_sequences.cache_clear()
4744
4745
# matches ANSI SGR sequences such as '\033[31m' / '\033[0m'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip ANSI color/style escape sequences from *string*."""
    return _terminal_sequences_re.sub('', string)
4751
4752
def number_of_digits(number):
    """Length of *number* rendered with '%d' (includes a '-' sign, truncates floats)."""
    rendered = '%d' % number
    return len(rendered)
4755
4756
def join_nonempty(*values, delim='-', from_dict=None):
    """Stringify and join the truthy *values* with *delim*; when *from_dict*
    is given, the values are first looked up from it via traversal."""
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
4761
4762
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    best = max(
        (tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats),
        default=(0, 0))
    if not best[0]:
        # no format declares a width; leave the thumbnails untouched
        return thumbnails
    best_width = str(best[0])
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, best_width, thumb['url'])},
            dict(zip(dimension_keys, best)), thumb)
        for thumb in thumbnails
    ]
4783
4784
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    match = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if match is None:
        return None, None, None
    start, end, total = match.groups()
    # end/total may be absent, in which case int_or_none yields None
    return int(start), int_or_none(end), int_or_none(total)
4793
4794
def read_stdin(what):
    """Announce (when *what* is given) that input is expected, then return stdin."""
    if what:
        eof_key = 'Ctrl+D'
        if compat_os_name == 'nt':
            eof_key = 'Ctrl+Z'
        write_string(f'Reading {what} from STDIN - EOF ({eof_key}) to end:\n')
    return sys.stdin
4800
4801
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """
    # a BOM at the start of the data wins over any in-band declaration
    for bom, encoding in BOMS:
        if data.startswith(bom):
            return encoding, len(bom)

    # drop NUL bytes so a UTF-16/UTF-32 encoded declaration still matches,
    # regardless of its endianness
    stripped = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if declaration:
        return declaration.group(1).decode(), 0
    return None, 0
4818
4819
class Config:
    """A parsed configuration source (CLI args or a config file), which may
    recursively include further configs via --config-locations."""
    own_args = None      # the raw argument list this config was created from
    parsed_args = None   # set once load_configs() has run
    filename = None      # backing file, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Load *args*/*filename* and any configs they reference.
        @returns False if this file was already loaded (recursion guard)"""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False  # already loaded; avoid infinite config recursion
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # '-' means read additional options from stdin (only once)
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read *filename* and shlex-split its contents into an argument list;
        returns *default* when the file cannot be opened."""
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)  # skip a BOM, if any
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # include the actual file name (was a hardcoded "(unknown)" placeholder)
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of *opts* with the values of credential options
        replaced by the string 'PRIVATE' (both '--opt value' and '--opt=value')."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        # child configs share the loaded-path set so recursion is detected globally
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        # later-appended configs take lower precedence: own args come last
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
4927
4928
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in headers.items():
            # title-casing normalizes e.g. 'content-type' and 'Content-Type'
            merged[name.title()] = value
    return merged
4932
4933
def cached_method(f):
    """Cache a method"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # bind + apply_defaults normalizes positional/keyword spellings so
        # equivalent calls share one cache key
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        cache_key = tuple(bound.arguments.values())[1:]  # drop `self`

        # per-instance cache, namespaced by method name
        store = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
        if cache_key not in store:
            store[cache_key] = f(self, *args, **kwargs)
        return store[cache_key]
    return wrapper
4949
4950
class classproperty:
    """property access for class methods with optional caching"""
    def __new__(cls, func=None, *args, **kwargs):
        # Support both `@classproperty` and `@classproperty(cache=True)`:
        # when no function is given yet, return a decorator expecting one
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        # the cache is keyed on the owner class, so each subclass gets
        # its own independently computed value
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        # the instance (first descriptor argument) is deliberately ignored;
        # the wrapped function always receives the class
        if self._cache is None:
            return self.func(cls)
        elif cls not in self._cache:
            self._cache[cls] = self.func(cls)
        return self._cache[cls]
4969
4970
class function_with_repr:
    """Wrap a callable, optionally overriding the text returned by repr()."""

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        # fall back to module.qualname when no (truthy) override was given
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
4983
4984
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        # iterate over the attribute VALUES, in insertion order
        yield from vars(self).values()

    @property
    def items_(self):
        # trailing underscore avoids clashing with a potential `items` attribute
        return vars(self).items()
4994
4995
# Known media-related file extensions, grouped by kind.
# NB: `video` and `audio` are extended below to also contain the `common_*` sets.
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# fold the "common" sets into the full video/audio lists
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# every extension recognised as downloadable media (video, audio or manifest)
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5010
5011
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # attempt: number of attempts made so far; _error: last error, where
    # NO_DEFAULT marks "attempt in progress, no error recorded yet"
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        # extra kwargs are pre-bound into the error callback
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # retry while the last attempt recorded an error (or none was made yet)
        # and the attempt budget is not exhausted
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # NO_DEFAULT (attempt in progress) is presented to users as "no error"
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT  # reset before each attempt
            self.attempt += 1
            yield self
            # the loop body sets `retry.error` on failure; report it
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        # budget exhausted: report via `error` if given, otherwise raise
        if count > retries:
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # prefer the underlying cause/original message for readability
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        # sleep_func may be a constant delay or a callable of the retry count
        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5066
5067
def make_archive_id(ie, video_id):
    """Build a download-archive entry: lowercased extractor key, space, video id."""
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
5071
5072
def truncate_string(s, left, right=0):
    """Shorten *s* to at most left+right characters, eliding the middle with '...'
    (the ellipsis is counted against the *left* budget). None passes through."""
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5078
5079
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Resolve *options* (names, aliases, optional regexes, '-' prefixed removals)
    against *alias_dict* into a de-duplicated ordered list.

    @param alias_dict  maps alias -> list of names; must contain the 'all' alias
                       listing every valid name
    @param use_regex   treat non-alias entries as case-insensitive regexes
    @param start       initial selection to build upon
    @raises ValueError for an entry matching neither an alias nor a known name
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for val in options:
        # a leading '-' means "remove this (or everything it expands to)"
        discard = val.startswith('-')
        if discard:
            val = val[1:]

        if val in alias_dict:
            # when discarding an alias, negate each of its members instead
            val = alias_dict[val] if not discard else [
                i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(val, alias_dict, start=requested)
            continue

        current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
                   else [val] if val in alias_dict['all'] else None)
        if current is None:
            raise ValueError(val)

        if discard:
            for item in current:
                while item in requested:
                    requested.remove(item)
        else:
            requested.extend(current)

    return orderedSet(requested)
5108
5109
5110 # TODO: Rewrite
class FormatSorter:
    """Sort media formats according to a user/extractor-given list of fields.

    A sort specification is a sequence of items like ``field``, ``+field``
    (reverse), ``field:limit`` (prefer values up to *limit*) or ``field~limit``
    (prefer values closest to *limit*). ``calculate_preference`` converts a
    format dict into a tuple of per-field preference tuples; these tuples
    compare lexicographically, with larger meaning "better".
    """

    # Parses a single sort-spec item into reverse marker, field name,
    # separator (':' or '~') and limit text. Matches the empty string too,
    # in which case the 'field' group is None.
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    # Default sort order, always appended after user/extractor fields
    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Sort order emulating youtube-dl's behavior (used elsewhere via --compat-options)
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration. Recognized keys (defaults are filled in lazily
    # by _get_field_setting):
    #   type: 'field' (default), 'ordered', 'extractor', 'boolean',
    #         'combined'/'multiple' (derived from several other fields), 'alias'
    #   field: the format-dict key (or tuple of sub-fields) backing this name
    #   order/order_free: ranking lists for 'ordered' fields (earlier = better;
    #         'order_free' is used when prefer_free_formats is set)
    #   regex: entries of 'order' are regexes rather than literal values
    #   convert: how to coerce values ('float', 'float_none', 'bytes',
    #         'string', 'order', 'ignore', or the 'float_string' fallback)
    #   forced/priority/visible/max/in_list/not_in_list/function/default:
    #         see their uses in the methods below
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        # Derived fields, computed from several underlying fields
        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        """Build the effective sort order from *field_preference* (the
        extractor-supplied spec) combined with the user's params on *ydl*."""
        self.ydl = ydl
        self._order = []  # effective field order, filled by evaluate_params
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return setting *key* for *field*, lazily filling in defaults.

        Unknown fields are accepted (with a deprecation notice) and get an
        empty settings entry; computed defaults are cached back into
        ``self.settings`` so later lookups are plain dict reads.
        """
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                # 'combined'/'multiple' types expect a tuple of sub-fields
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert a raw field *value* to a sortable number (or string).

        With ``convertNone=True``, ``None`` is pushed through the conversion
        instead of short-circuiting (used for 'ordered' fields, whose order
        lists may rank ``None`` explicitly).
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # Values not in the list rank at the ''-entry; if there is none,
            # they rank below every listed entry (list_length + 1)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            # 'float_string' fallback: numeric strings become floats; the
            # first non-numeric value switches the field to string sorting
            if value.isnumeric():
                return float(value)
            else:
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Combine forced/priority defaults, the user's --format-sort, the
        extractor's spec and the class defaults into ``self._order``, storing
        per-field reverse/closest/limit data into ``self.settings``."""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Register one concrete field; the first occurrence of a field wins
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # Precedence: forced fields always first; priority fields next unless
        # format_sort_force lets the user's spec override them; then user,
        # extractor and finally the built-in defaults
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A 'combined' field expands to its sub-fields, each optionally
            # taking its own ':'-separated limit
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Write the resolved sort order (user, extractor and effective) via
        *write_debug*, skipping fields marked as not visible."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Map one field *value* to a comparison tuple.

        The returned tuples compare lexicographically; the leading element
        buckets values (missing < strings-bucket < limited-out < normal) and
        the remaining elements order within the bucket.
        """
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Compute the preference tuple of a single sort field for *format*,
        resolving 'multiple' fields through their aggregation function."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return the sort key for *format* as a tuple of per-field tuples.

        Also fills in missing derived fields (protocol, ext, video/audio ext,
        bitrates) on *format* as a side effect, since later fields rely on them.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #     format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5405
5406
5407 # XXX: Temporary
5408 class _YDLLogger:
5409 def __init__(self, ydl=None):
5410 self._ydl = ydl
5411
5412 def debug(self, message):
5413 if self._ydl:
5414 self._ydl.write_debug(message)
5415
5416 def info(self, message):
5417 if self._ydl:
5418 self._ydl.to_screen(message)
5419
5420 def warning(self, message, *, once=False):
5421 if self._ydl:
5422 self._ydl.report_warning(message, once)
5423
5424 def error(self, message, *, is_error=True):
5425 if self._ydl:
5426 self._ydl.report_error(message, is_error=is_error)
5427
5428 def stdout(self, message):
5429 if self._ydl:
5430 self._ydl.to_stdout(message)
5431
5432 def stderr(self, message):
5433 if self._ydl:
5434 self._ydl.to_stderr(message)