yt_dlp/utils/_utils.py

   1 import base64
   2 import binascii
   3 import calendar
   4 import codecs
   5 import collections
   6 import collections.abc
   7 import contextlib
   8 import datetime
   9 import email.header
  10 import email.utils
  11 import errno
  12 import hashlib
  13 import hmac
  14 import html.entities
  15 import html.parser
  16 import inspect
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import mimetypes
  23 import netrc
  24 import operator
  25 import os
  26 import platform
  27 import random
  28 import re
  29 import shlex
  30 import socket
  31 import ssl
  32 import struct
  33 import subprocess
  34 import sys
  35 import tempfile
  36 import time
  37 import traceback
  38 import types
  39 import unicodedata
  40 import urllib.error
  41 import urllib.parse
  42 import urllib.request
  43 import xml.etree.ElementTree
  44
  45 from . import traversal
  46
  47 from ..compat import functools  # isort: split
  48 from ..compat import (
  49     compat_etree_fromstring,
  50     compat_expanduser,
  51     compat_HTMLParseError,
  52     compat_os_name,
  53     compat_shlex_quote,
  54 )
  55 from ..dependencies import xattr
  56
# Drop the last dotted component of this module's name so that objects defined
# here report the parent package as their module
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
# (obtained by taking the type of an actual compiled pattern object)
compiled_regex_type = type(re.compile(''))
  61
  62
  63 class NO_DEFAULT:
  64     pass
  65
  66
  67 def IDENTITY(x):
  68     return x
  69
  70
# English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names in calendar order, keyed by two-letter language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (Polish: "dopełniacz")
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
  86
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Maps a timezone abbreviation to its offset from UTC; the values here look like
# whole hours (e.g. EST is -5)
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
  96
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The replacements are
# zipped positionally against the characters, so multi-letter replacements
# ('AE', 'OE', 'TH', 'ss', ...) are passed as single list items to keep the two
# sequences aligned
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 101
# strptime-style date/time patterns that do not depend on whether the day or the
# month comes first; both lists below start with a copy of these
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Common formats plus the numeric ones that are read as day, then month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# Common formats plus the numeric ones that are read as month, then day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 167
# Matches the trailing `}('...',<int>,<int>,'a|b|c'.split('|')` call and captures
# its four arguments (presumably the tail of "packed" JavaScript - confirm with callers)
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element and captures its
# object/array payload in the `json_ld` group
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Unsigned decimal number with an optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
 172
 173
 174 @functools.cache
 175 def preferredencoding():
 176     """Get preferred encoding.
 177
 178     Returns the best encoding scheme for the system, based on
 179     locale.getpreferredencoding() and some further tweaks.
 180     """
 181     try:
 182         pref = locale.getpreferredencoding()
 183         'TEST'.encode(pref)
 184     except Exception:
 185         pref = 'UTF-8'
 186
 187     return pref
 188
 189
 190 def write_json_file(obj, fn):
 191     """ Encode obj as JSON and write it to fn, atomically if possible """
 192
 193     tf = tempfile.NamedTemporaryFile(
 194         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 195         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 196
 197     try:
 198         with tf:
 199             json.dump(obj, tf, ensure_ascii=False)
 200         if sys.platform == 'win32':
 201             # Need to remove existing file on Windows, else os.rename raises
 202             # WindowsError or FileExistsError.
 203             with contextlib.suppress(OSError):
 204                 os.unlink(fn)
 205         with contextlib.suppress(OSError):
 206             mask = os.umask(0)
 207             os.umask(mask)
 208             os.chmod(tf.name, 0o666 & ~mask)
 209         os.rename(tf.name, fn)
 210     except Exception:
 211         with contextlib.suppress(OSError):
 212             os.remove(tf.name)
 213         raise
 214
 215
 216 def find_xpath_attr(node, xpath, key, val=None):
 217     """ Find the xpath xpath[@key=val] """
 218     assert re.match(r'^[a-zA-Z_-]+$', key)
 219     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 220     return node.find(expr)
 221
 222 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 223 # the namespace parameter
 224
 225
 226 def xpath_with_ns(path, ns_map):
 227     components = [c.split(':') for c in path.split('/')]
 228     replaced = []
 229     for c in components:
 230         if len(c) == 1:
 231             replaced.append(c[0])
 232         else:
 233             ns, tag = c
 234             replaced.append('{%s}%s' % (ns_map[ns], tag))
 235     return '/'.join(replaced)
 236
 237
 238 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 239     def _find_xpath(xpath):
 240         return node.find(xpath)
 241
 242     if isinstance(xpath, str):
 243         n = _find_xpath(xpath)
 244     else:
 245         for xp in xpath:
 246             n = _find_xpath(xp)
 247             if n is not None:
 248                 break
 249
 250     if n is None:
 251         if default is not NO_DEFAULT:
 252             return default
 253         elif fatal:
 254             name = xpath if name is None else name
 255             raise ExtractorError('Could not find XML element %s' % name)
 256         else:
 257             return None
 258     return n
 259
 260
 261 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 262     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 263     if n is None or n == default:
 264         return n
 265     if n.text is None:
 266         if default is not NO_DEFAULT:
 267             return default
 268         elif fatal:
 269             name = xpath if name is None else name
 270             raise ExtractorError('Could not find XML element\'s text %s' % name)
 271         else:
 272             return None
 273     return n.text
 274
 275
 276 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 277     n = find_xpath_attr(node, xpath, key)
 278     if n is None:
 279         if default is not NO_DEFAULT:
 280             return default
 281         elif fatal:
 282             name = f'{xpath}[@{key}]' if name is None else name
 283             raise ExtractorError('Could not find XML attribute %s' % name)
 284         else:
 285             return None
 286     return n.attrib[key]
 287
 288
 289 def get_element_by_id(id, html, **kwargs):
 290     """Return the content of the tag with the specified ID in the passed HTML document"""
 291     return get_element_by_attribute('id', id, html, **kwargs)
 292
 293
 294 def get_element_html_by_id(id, html, **kwargs):
 295     """Return the html of the tag with the specified ID in the passed HTML document"""
 296     return get_element_html_by_attribute('id', id, html, **kwargs)
 297
 298
 299 def get_element_by_class(class_name, html):
 300     """Return the content of the first tag with the specified class in the passed HTML document"""
 301     retval = get_elements_by_class(class_name, html)
 302     return retval[0] if retval else None
 303
 304
 305 def get_element_html_by_class(class_name, html):
 306     """Return the html of the first tag with the specified class in the passed HTML document"""
 307     retval = get_elements_html_by_class(class_name, html)
 308     return retval[0] if retval else None
 309
 310
 311 def get_element_by_attribute(attribute, value, html, **kwargs):
 312     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 313     return retval[0] if retval else None
 314
 315
 316 def get_element_html_by_attribute(attribute, value, html, **kargs):
 317     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 318     return retval[0] if retval else None
 319
 320
 321 def get_elements_by_class(class_name, html, **kargs):
 322     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 323     return get_elements_by_attribute(
 324         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 325         html, escape_value=False)
 326
 327
 328 def get_elements_html_by_class(class_name, html):
 329     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 330     return get_elements_html_by_attribute(
 331         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 332         html, escape_value=False)
 333
 334
 335 def get_elements_by_attribute(*args, **kwargs):
 336     """Return the content of the tag with the specified attribute in the passed HTML document"""
 337     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 338
 339
 340 def get_elements_html_by_attribute(*args, **kwargs):
 341     """Return the html of the tag with the specified attribute in the passed HTML document"""
 342     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 343
 344
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Yield the text (content) and the html (whole) of each tag with the specified
    attribute in the passed HTML document

    @param attribute     Attribute name; it is regex-escaped
    @param value         Attribute value to match; nothing is yielded if it is empty
    @param html          The HTML document to search
    @param tag           Regex for the tag name
    @param escape_value  Regex-escape value; pass False if value is already a regex
    """
    if not value:
        return

    # Quotes around the attribute value are optional ('?'), unless value starts
    # with a character from the set below
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches from the opening '<' up to and including the wanted attribute;
    # the rest of the element is located by get_element_text_and_html_by_tag
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of matching quotes wrapping the whole content, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 370
 371
 372 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 373     """
 374     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 375     closing tag for the first opening tag it has encountered, and can be used
 376     as a context manager
 377     """
 378
 379     class HTMLBreakOnClosingTagException(Exception):
 380         pass
 381
 382     def __init__(self):
 383         self.tagstack = collections.deque()
 384         html.parser.HTMLParser.__init__(self)
 385
 386     def __enter__(self):
 387         return self
 388
 389     def __exit__(self, *_):
 390         self.close()
 391
 392     def close(self):
 393         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 394         # so data remains buffered; we no longer have any interest in it, thus
 395         # override this method to discard it
 396         pass
 397
 398     def handle_starttag(self, tag, _):
 399         self.tagstack.append(tag)
 400
 401     def handle_endtag(self, tag):
 402         if not self.tagstack:
 403             raise compat_HTMLParseError('no tags in the stack')
 404         while self.tagstack:
 405             inner_tag = self.tagstack.pop()
 406             if inner_tag == tag:
 407                 break
 408         else:
 409             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 410         if not self.tagstack:
 411             raise self.HTMLBreakOnClosingTagException()
 412
 413
 414 # XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    Raises compat_HTMLParseError if the opening tag, its closing '>' or the
    matching closing tag cannot be found.
    """
    def find_or_raise(haystack, needle, exc):
        # Like str.index, but raises the given exception instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Turn the relative index of '>' into the absolute index just past it
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed only the opening tag first, so that it ends up at the bottom of the stack
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document one closing-tag candidate at a time; the parser
        # raises once the candidate closes the outermost element
        while offset < len(html):
            # Both indices below are relative to offset
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
 448
 449
 450 class HTMLAttributeParser(html.parser.HTMLParser):
 451     """Trivial HTML parser to gather the attributes for a single element"""
 452
 453     def __init__(self):
 454         self.attrs = {}
 455         html.parser.HTMLParser.__init__(self)
 456
 457     def handle_starttag(self, tag, attrs):
 458         self.attrs = dict(attrs)
 459         raise compat_HTMLParseError('done')
 460
 461
 462 class HTMLListAttrsParser(html.parser.HTMLParser):
 463     """HTML parser to gather the attributes for the elements of a list"""
 464
 465     def __init__(self):
 466         html.parser.HTMLParser.__init__(self)
 467         self.items = []
 468         self._level = 0
 469
 470     def handle_starttag(self, tag, attrs):
 471         if tag == 'li' and self._level == 0:
 472             self.items.append(dict(attrs))
 473         self._level += 1
 474
 475     def handle_endtag(self, tag):
 476         self._level -= 1
 477
 478
 479 def extract_attributes(html_element):
 480     """Given a string for an HTML element such as
 481     <el
 482          a="foo" B="bar" c="&98;az" d=boz
 483          empty= noval entity="&amp;"
 484          sq='"' dq="'"
 485     >
 486     Decode and return a dictionary of attributes.
 487     {
 488         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 489         'empty': '', 'noval': None, 'entity': '&',
 490         'sq': '"', 'dq': '\''
 491     }.
 492     """
 493     parser = HTMLAttributeParser()
 494     with contextlib.suppress(compat_HTMLParseError):
 495         parser.feed(html_element)
 496         parser.close()
 497     return parser.attrs
 498
 499
 500 def parse_list(webpage):
 501     """Given a string for an series of HTML <li> elements,
 502     return a dictionary of their attributes"""
 503     parser = HTMLListAttrsParser()
 504     parser.feed(webpage)
 505     parser.close()
 506     return parser.items
 507
 508
 509 def clean_html(html):
 510     """Clean an HTML snippet into a readable string"""
 511
 512     if html is None:  # Convenience for sanitizing descriptions etc.
 513         return html
 514
 515     html = re.sub(r'\s+', ' ', html)
 516     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 517     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 518     # Strip html tags
 519     html = re.sub('<.*?>', '', html)
 520     # Replace html entities
 521     html = unescapeHTML(html)
 522     return html.strip()
 523
 524
 525 class LenientJSONDecoder(json.JSONDecoder):
 526     # TODO: Write tests
 527     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 528         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 529         self._close_attempts = 2 * close_objects
 530         super().__init__(*args, **kwargs)
 531
 532     @staticmethod
 533     def _close_object(err):
 534         doc = err.doc[:err.pos]
 535         # We need to add comma first to get the correct error message
 536         if err.msg.startswith('Expecting \',\''):
 537             return doc + ','
 538         elif not doc.endswith(','):
 539             return
 540
 541         if err.msg.startswith('Expecting property name'):
 542             return doc[:-1] + '}'
 543         elif err.msg.startswith('Expecting value'):
 544             return doc[:-1] + ']'
 545
 546     def decode(self, s):
 547         if self.transform_source:
 548             s = self.transform_source(s)
 549         for attempt in range(self._close_attempts + 1):
 550             try:
 551                 if self.ignore_extra:
 552                     return self.raw_decode(s.lstrip())[0]
 553                 return super().decode(s)
 554             except json.JSONDecodeError as e:
 555                 if e.pos is None:
 556                     raise
 557                 elif attempt < self._close_attempts:
 558                     s = self._close_object(e)
 559                     if s is not None:
 560                         continue
 561                 raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
 562         assert False, 'Too many attempts to decode JSON'
 563
 564
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, the filename is
    passed through sanitize_path() and opening is retried once. If that
    fails as well, or if sanitizing does not change the filename, the
    exception is raised, like with the standard open() function.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        # Prefer the underlying binary buffer when stdout has one
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # First pass uses the filename as given, second pass the sanitized one
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed (LockingUnsupportedError is presumably an
                # OSError subclass - confirm); fall back to a plain open
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Do not retry after the second attempt or on permission errors
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                # Sanitizing changed nothing, so retrying would fail the same way
                raise
 600             if old_filename == filename:
 601                 raise
 602
 603
 604 def timeconvert(timestr):
 605     """Convert RFC 2822 defined time string into system timestamp"""
 606     timestamp = None
 607     timetuple = email.utils.parsedate_tz(timestr)
 608     if timetuple is not None:
 609         timestamp = email.utils.mktime_tz(timetuple)
 610     return timestamp
 611
 612
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; never empty unless s is empty
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Substitutes inserted here are prefixed with '\0' so that the
        # post-processing below can tell them apart from original characters
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Drop question marks and control characters
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            # Characters in Unicode categories C* and M* are dropped, the rest become '_'
            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers themselves; fall back to '_' if nothing is left
    result = result.replace('\0', '') or '_'

    # NB: NO_DEFAULT is a class and thus truthy, so this cleanup only runs
    # when is_id was explicitly passed as a falsy value
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 666
 667
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    @param force    Also sanitize on other platforms; without it, s is
                    returned unchanged there
    """
    # XXX: this handles drive relative paths (c:sth) incorrectly
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component produced by the separator following the drive
        norm_path.pop(0)
    # Replace forbidden characters, as well as a trailing space or dot, with '#',
    # leaving the special components '.' and '..' alone
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    # TODO: Fix behavioral differences <3.12
    # The workaround using `normpath` only superficially passes tests
    # Ref: https://github.com/python/cpython/pull/100351
    return os.path.normpath(os.path.join(*sanitized_path))
 693
 694
 695 def sanitize_url(url, *, scheme='http'):
 696     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 697     # the number of unwanted failures due to missing protocol
 698     if url is None:
 699         return
 700     elif url.startswith('//'):
 701         return f'{scheme}:{url}'
 702     # Fix some common typos seen so far
 703     COMMON_TYPOS = (
 704         # https://github.com/ytdl-org/youtube-dl/issues/15649
 705         (r'^httpss://', r'https://'),
 706         # https://bx1.be/lives/direct-tv/
 707         (r'^rmtp([es]?)://', r'rtmp\1://'),
 708     )
 709     for mistake, fixup in COMMON_TYPOS:
 710         if re.match(mistake, url):
 711             return re.sub(mistake, fixup, url)
 712     return url
 713
 714
 715 def extract_basic_auth(url):
 716     parts = urllib.parse.urlsplit(url)
 717     if parts.username is None:
 718         return url, None
 719     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 720         parts.hostname if parts.port is None
 721         else '%s:%d' % (parts.hostname, parts.port))))
 722     auth_payload = base64.b64encode(
 723         ('%s:%s' % (parts.username, parts.password or '')).encode())
 724     return url, f'Basic {auth_payload.decode()}'
 725
 726
 727 def expand_path(s):
 728     """Expand shell variables and ~"""
 729     return os.path.expandvars(compat_expanduser(s))
 730
 731
 732 def orderedSet(iterable, *, lazy=False):
 733     """Remove all duplicates from the input iterable"""
 734     def _iter():
 735         seen = []  # Do not use set since the items can be unhashable
 736         for x in iterable:
 737             if x not in seen:
 738                 seen.append(x)
 739                 yield x
 740
 741     return _iter() if lazy else list(_iter())
 742
 743
 744 def _htmlentity_transform(entity_with_semicolon):
 745     """Transforms an HTML entity to a character."""
 746     entity = entity_with_semicolon[:-1]
 747
 748     # Known non-numeric HTML entity
 749     if entity in html.entities.name2codepoint:
 750         return chr(html.entities.name2codepoint[entity])
 751
 752     # TODO: HTML5 allows entities without a semicolon.
 753     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 754     if entity_with_semicolon in html.entities.html5:
 755         return html.entities.html5[entity_with_semicolon]
 756
 757     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 758     if mobj is not None:
 759         numstr = mobj.group(1)
 760         if numstr.startswith('x'):
 761             base = 16
 762             numstr = '0%s' % numstr
 763         else:
 764             base = 10
 765         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 766         with contextlib.suppress(ValueError):
 767             return chr(int(numstr, base))
 768
 769     # Unknown entity in name, return its literal representation
 770     return '&%s;' % entity
 771
 772
 773 def unescapeHTML(s):
 774     if s is None:
 775         return None
 776     assert isinstance(s, str)
 777
 778     return re.sub(
 779         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 780
 781
 782 def escapeHTML(text):
 783     return (
 784         text
 785         .replace('&', '&amp;')
 786         .replace('<', '&lt;')
 787         .replace('>', '&gt;')
 788         .replace('"', '&quot;')
 789         .replace("'", '&#39;')
 790     )
 791
 792
 793 class netrc_from_content(netrc.netrc):
 794     def __init__(self, content):
 795         self.hosts, self.macros = {}, {}
 796         with io.StringIO(content) as stream:
 797             self._parse('-', stream, False)
 798
 799
class Popen(subprocess.Popen):
    """subprocess.Popen with an adjusted environment, text mode defaults and some convenience methods"""

    # On Windows, pass a STARTUPINFO with STARTF_USESHOWWINDOW set to every child
    # (presumably to keep console windows from popping up - confirm)
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573

        Modifies env in place; does nothing unless sys._MEIPASS exists.
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            # Restore the value saved as <key>_ORIG, or unset key if none was saved
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        """
        @param env      Environment of the child; defaults to a copy of os.environ
        @param text     If True, use text mode with 'utf-8' encoding and 'replace'
                        error handling unless specified otherwise in kwargs
        @param shell    On Windows, unless `executable` is given, the command is
                        run through the command interpreter explicitly
        Remaining arguments are passed on to subprocess.Popen.
        """
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether the streams are in text mode; run() needs this to
        # pick a default of the matching type
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            # Build the command line for the interpreter ourselves instead of
            # letting subprocess do it
            if not isinstance(args, str):
                args = ' '.join(compat_shlex_quote(a) for a in args)
            shell = False
            args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        """Return the absolute path of the command interpreter, else raise FileNotFoundError"""
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process and wait for it if anything is raised"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process; unless timeout is 0, also wait() for it with that timeout"""
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run the process to completion and return the tuple (stdout, stderr, returncode)

        Falsy output (e.g. None) is replaced with an empty str or bytes,
        depending on the text mode.
        """
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
 870
 871
 872 def encodeArgument(s):
 873     # Legacy code that uses byte strings
 874     # Uncomment the following line after fixing all post processors
 875     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 876     return s if isinstance(s, str) else s.decode('ascii')
 877
 878
 879 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 880
 881
 882 def timetuple_from_msec(msec):
 883     secs, msec = divmod(msec, 1000)
 884     mins, secs = divmod(secs, 60)
 885     hrs, mins = divmod(mins, 60)
 886     return _timetuple(hrs, mins, secs, msec)
 887
 888
 889 def formatSeconds(secs, delim=':', msec=False):
 890     time = timetuple_from_msec(secs * 1000)
 891     if time.hours:
 892         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 893     elif time.minutes:
 894         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 895     else:
 896         ret = '%d' % time.seconds
 897     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 898
 899
 900 def bug_reports_message(before=';'):
 901     from ..update import REPOSITORY
 902
 903     msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
 904            'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')
 905
 906     before = before.rstrip()
 907     if not before or before.endswith(('.', '!', '?')):
 908         msg = msg[0].title() + msg[1:]
 909
 910     return (before + ' ' if before else '') + msg
 911
 912
 913 class YoutubeDLError(Exception):
 914     """Base exception for YoutubeDL errors."""
 915     msg = None
 916
 917     def __init__(self, msg=None):
 918         if msg is not None:
 919             self.msg = msg
 920         elif self.msg is None:
 921             self.msg = type(self).__name__
 922         super().__init__(self.msg)
 923
 924
class ExtractorError(YoutubeDLError):
    """Error during info extraction.

    The displayed message is assembled by the private `__msg` property from `ie`, `video_id`,
    `orig_msg` and `cause`; bug_reports_message() is appended unless `expected` is set.
    """

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        cause, video_id and ie, if given, become part of the formatted message.
        """
        from ..networking.exceptions import network_exceptions
        # If a network exception is currently being handled, the error is always marked as expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        # All fields read by `__msg` must be assigned before super().__init__() is called
        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When created while handling another ExtractorError, reuse that error's exc_info
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: "[ie] video_id: orig_msg (caused by ...)" plus the bug report text if unexpected
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted `traceback` and `cause` joined by a newline, or None if there is neither"""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once a message exists, changing any other attribute rebuilds `msg` and `args`.
        # 'msg' and 'args' themselves are excluded, which also stops the recursion
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
 967
 968
 969 class UnsupportedError(ExtractorError):
 970     def __init__(self, url):
 971         super().__init__(
 972             'Unsupported URL: %s' % url, expected=True)
 973         self.url = url
 974
 975
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match

    Adds nothing to ExtractorError; it only provides a distinct exception type.
    """
    pass
 979
 980
 981 class GeoRestrictedError(ExtractorError):
 982     """Geographic restriction Error exception.
 983
 984     This exception may be thrown when a video is not available from your
 985     geographic location due to geographic restrictions imposed by a website.
 986     """
 987
 988     def __init__(self, msg, countries=None, **kwargs):
 989         kwargs['expected'] = True
 990         super().__init__(msg, **kwargs)
 991         self.countries = countries
 992
 993
 994 class UserNotLive(ExtractorError):
 995     """Error when a channel/user is not live"""
 996
 997     def __init__(self, msg=None, **kwargs):
 998         kwargs['expected'] = True
 999         super().__init__(msg or 'The channel is not currently live', **kwargs)
1000
1001
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
1014
1015
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Default message, used when the exception is created without arguments
    msg = 'Entry not found in info'
1023
1024
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    If a filename is given, it is appended to the message.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        message = self.msg if filename is None else f'{self.msg}: {filename}'
        super().__init__(message)
1037
1038
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    It adds no behaviour of its own to YoutubeDLError.
    """
1045
1046
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Default message; subclasses replace it with the reason for the interruption
    msg = 'The download was cancelled'
1050
1051
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Default message shown for this cancellation reason
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1055
1056
class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    # Default message shown for this cancellation reason
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1060
1061
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Default message shown for this cancellation reason
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1065
1066
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. The `expected` flag is stored as given. """

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1073
1074
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Takes no arguments: always uses the class message with expected=False
        super().__init__(self.msg, expected=False)
1081
1082
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    If `err` is given, it is appended to the message.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        message = self.msg if err is None else f'{self.msg}: {err}'
        super().__init__(message)
1095
1096
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
1110
1111
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an errno `code` and a `msg`, classified into a `reason`

    `reason` is 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED', decided from
    the error code and from well-known phrases in the message.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code, self.msg = code, msg

        # Parsing code and msg
        out_of_space = (
            code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in msg or 'Disk quota exceeded' in msg)
        if out_of_space:
            self.reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1126
1127
class XAttrUnavailableError(YoutubeDLError):
    # Distinct exception type with no behaviour of its own.
    # NOTE(review): presumably raised when xattr support is unavailable - confirm against the raise sites
    pass
1130
1131
def is_path_like(f):
    """Return whether `f` is a str, bytes or os.PathLike object"""
    return isinstance(f, str) or isinstance(f, bytes) or isinstance(f, os.PathLike)
1134
1135
def extract_timezone(date_str):
    """Split a trailing timezone off a date string

    Recognises "Z" and numeric "+hh[:]mm" / "-hh[:]mm" offsets; otherwise an upper-case
    abbreviation following a time is looked up in TIMEZONE_NAMES.

    @returns  (timezone, date_str): the offset as a datetime.timedelta (zero if none was
              recognised) and the date string with the recognised timezone removed
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]
        sign_char = m.group('sign')
        if not sign_char:
            # Plain "Z": UTC
            return datetime.timedelta(), date_str
        sign = 1 if sign_char == '+' else -1
        offset = datetime.timedelta(
            hours=sign * int(m.group('hours')),
            minutes=sign * int(m.group('minutes')))
        return offset, date_str

    # No numeric offset: try a named timezone after a time of day
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    named_offset = TIMEZONE_NAMES.get(m and m.group('tz').strip())
    if named_offset is not None:
        date_str = date_str[:-len(m.group('tz'))]
    return datetime.timedelta(hours=named_offset or 0), date_str
1164
1165
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param delimiter  Separator between the date and the time part
    @param timezone   Offset as a datetime.timedelta; extracted from date_str when None
    @returns          The timestamp, or None if date_str is None or does not match
                      "%Y-%m-%d<delimiter>%H:%M:%S" (fractional seconds are dropped)
    """
    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)
    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        parsed = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((parsed - timezone).timetuple())
    except ValueError:
        return None
1181
1182
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST if day_first is truthy, else DATE_FORMATS_MONTH_FIRST"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1185
1186
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    date_str = extract_timezone(date_str)[1]

    upload_date = None
    # Every format is tried; if several match, the last one determines the result
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the RFC 2822 style parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return str(upload_date)
1209
1210
def unified_timestamp(date_str, day_first=True):
    """Parse a free-form date/time string into a UNIX timestamp

    Commas, "|" and weekday names are stripped, the timezone is split off with
    extract_timezone(), an AM/PM marker is removed, and the rest is tried against
    date_formats(day_first) before falling back to email.utils.parsedate_tz().

    @param date_str   String to parse; anything that is not a str gives None
    @param day_first  Passed to date_formats() to choose the set of patterns
    @returns          The timestamp as a number, or None if nothing matched
    """
    if not isinstance(date_str, str):
        return None

    # Strip separators and weekday names (Monday through Sunday, full or abbreviated)
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str))

    # NOTE(review): 12 hours are added whenever "PM" occurs in the string,
    # which includes "12:xx PM" - confirm that this is intended
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # The first format that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1242
1243
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL

    The part after the last "." and before any "?" is returned if it is purely
    alphanumeric, or (with trailing slashes removed) if it is in KNOWN_EXTENSIONS.
    Otherwise, and for None or dot-less input, default_ext is returned.
    """
    if url is None or '.' not in url:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    return trimmed if trimmed in KNOWN_EXTENSIONS else default_ext
1255
1256
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by "<sub_lang>.<sub_format>" (via replace_extension)"""
    sub_ext = '.'.join((sub_lang, sub_format))
    return replace_extension(filename, sub_ext, expected_real_ext)
1259
1260
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # A plain DATE without an offset
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the offset may itself contain offsets, so it is resolved recursively
    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')

    if unit in ('month', 'year'):
        # Calendar-aware arithmetic; the result is rounded to days under auto precision
        new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(new_date, unit) if auto_precision else new_date
1301
1302
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
                   ValueError is raised for anything else
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1313
1314
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    year_offset, month_index = divmod(dt.month + months - 1, 12)
    year, month = dt.year + year_offset, month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1322
1323
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    With precision 'microsecond' the object is returned unchanged. Otherwise the result
    is a UTC datetime rounded to the nearest day/hour/minute/second (halves round up).
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'second': 1,
        'minute': 60,
        'hour': 3600,
        'day': 86400,
    }
    seconds = calendar.timegm(dt.timetuple())
    step = unit_seconds[precision]
    rounded = ((seconds + step / 2) // step) * step
    return datetime.datetime.fromtimestamp(rounded, datetime.timezone.utc)
1340
1341
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that do not consist of exactly eight digits are returned unchanged"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if match is None else '-'.join(match.groups())
1350
1351
class DateRange:
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str(strict=True)

        A missing start/end leaves the range open towards the earliest/latest
        representable date. ValueError is raised if start is after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start, strict=True)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date is in the range; strings are parsed with date_from_str"""
        day = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= day <= self.end

    def __repr__(self):
        bounds = f'{self.start.isoformat()!r}, {self.end.isoformat()!r}'
        return f'{__name__}.{type(self).__name__}({bounds})'

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
1385
1386
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime, platform, OpenSSL and libc (cached)"""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_ver = []

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1405
1406
@functools.cache
def get_windows_version():
    ''' Get Windows version as a version tuple (cached). returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1414
1415
def write_string(s, out=None, encoding=None):
    """Write the str `s` to `out` (default: sys.stderr) and flush it

    Streams opened in binary mode, and streams exposing a `buffer`, receive encoded
    bytes (undecodable characters are dropped); other streams receive the str as is.
    Nothing happens if no stream is available.
    """
    assert isinstance(s, str)
    stream = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not stream:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(stream):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
    stream_mode = getattr(stream, 'mode', None) or ''
    if 'b' in stream_mode:
        target = stream
        enc = encoding or preferredencoding()
    elif hasattr(stream, 'buffer'):
        target = stream.buffer
        enc = encoding or getattr(stream, 'encoding', None) or preferredencoding()
    else:
        target, enc = stream, None

    payload = s.encode(enc, 'ignore') if enc else s
    target.write(payload)
    stream.flush()
1436
1437
# TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report a deprecation

    In the CLI, each distinct message is reported once (through `printer` if given,
    otherwise write_string) together with bug_reports_message(). Outside the CLI,
    a DeprecationWarning is issued through the `warnings` module.
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    seen = deprecation_warning._cache
    if msg in seen:
        return
    seen.add(msg)

    report = f'{msg}{bug_reports_message()}'
    if printer:
        return printer(report, **kwargs)
    return write_string(f'ERROR: {report}\n', **kwargs)


# Messages that were already reported in the CLI
deprecation_warning._cache = set()
1454
1455
def bytes_to_intlist(bs):
    """Return the items of `bs` as a list of ints; non-int items are converted with ord()"""
    if not bs:
        return []
    # Indexing bytes yields ints (Python 3); a str yields characters
    return list(bs) if isinstance(bs[0], int) else list(map(ord, bs))
1463
1464
def intlist_to_bytes(xs):
    """Pack a sequence of ints (0-255) into bytes; an empty or None input gives b''"""
    if xs:
        return struct.pack(f'{len(xs)}B', *xs)
    return b''
1469
1470
class LockingUnsupportedError(OSError):
    """OSError with a fixed message, for use when file locking is not supported"""
    msg = 'File locking is not supported'

    def __init__(self):
        message = self.msg
        super().__init__(message)
1476
1477
# Cross-platform file locking
# Exactly one pair of `_lock_file(f, exclusive, block)` / `_unlock_file(f)` gets defined:
# LockFileEx/UnlockFileEx through ctypes on Windows, `fcntl` elsewhere, and stubs
# raising LockingUnsupportedError when `fcntl` cannot be imported
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock `f`; raises BlockingIOError if LockFileEx reports failure"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file() passes the same structure again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Unlock `f`, which must have been locked with _lock_file(); raises OSError on failure"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock `f` with flock(), falling back to lockf() on other OSErrors

            BlockingIOError is propagated to the caller without trying the fallback.
            """
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Unlock `f`, trying flock(), then lockf(), then flock() with LOCK_NB"""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            """Locking is unavailable without `fcntl`: always raises LockingUnsupportedError"""
            raise LockingUnsupportedError()

        def _unlock_file(f):
            """Locking is unavailable without `fcntl`: always raises LockingUnsupportedError"""
            raise LockingUnsupportedError()
1564
1565
class locked_file:
    """File wrapper that holds a lock on the file while it is used as a context manager

    The file is opened on construction and locked on entering the context
    (shared for read modes, exclusive otherwise). Unknown attributes and
    iteration are delegated to the underlying file object `f`.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """
        @param mode   One of r, rb, a, ab, w, wb; anything else raises NotImplementedError
        @param block  Whether acquiring the lock may block
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        can_write = not set(mode).isdisjoint('wax+')
        can_read = not set(mode).isdisjoint('r+')
        if not can_write:
            flags = os.O_RDONLY
        elif can_read:
            flags = os.O_RDWR
        else:
            flags = os.O_WRONLY
        # O_CLOEXEC is UNIX only; O_BINARY and O_NOINHERIT are Windows only
        for optional_flag in ('O_CLOEXEC', 'O_BINARY', 'O_NOINHERIT'):
            flags |= getattr(os, optional_flag, 0)
        if can_write:
            flags |= os.O_CREAT  # O_TRUNC only after locking
        if 'a' in mode:
            flags |= os.O_APPEND
        if 'x' in mode:
            flags |= os.O_EXCL

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        """Acquire the lock, then truncate the file if it was opened in a "w" mode"""
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' not in self.mode:
            return self
        try:
            self.f.truncate()
        except OSError as e:
            tolerated = (
                errno.ESPIPE,  # Illegal seek - expected for FIFO
                errno.EINVAL,  # Invalid argument - expected for /dev/null
            )
            if e.errno not in tolerated:
                raise
        return self

    def unlock(self):
        """Release the lock if it is held; the file stays open"""
        if self.locked:
            try:
                _unlock_file(self.f)
            finally:
                self.locked = False

    def __exit__(self, *_):
        """Release the lock and close the file, even if unlocking fails"""
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1629
1630
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None (cached)"""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1635
1636
def shell_quote(args):
    """Quote each argument with compat_shlex_quote and join the results with spaces

    bytes arguments are first decoded using the filesystem encoding.
    """
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
1646
1647
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    `data` is JSON-encoded into a "__youtubedl_smuggle" entry in the URL fragment.
    If `url` already carries smuggled data, that data is kept and takes precedence
    over equally named keys of `data`. The dict passed as `data` is not modified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a new dict so that the caller's dict is left untouched
    merged = {**data, **idata}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url() into (url, data)

    If the URL carries no smuggled data, (smug_url, default) is returned.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
1665
1666
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    @param num     The number; converted with float_or_none
    @param fmt     %-format string receiving (scaled number, suffix)
    @param factor  Ratio between consecutive suffixes; with 1024 the suffixes become Ki, Mi, ...
    @returns       The formatted string, or None if num is not a number or is negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to [0, len(POSSIBLE_SUFFIXES)]: numbers below 1 have a negative logarithm,
        # which would otherwise select a suffix from the end of the list
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
1679
1680
def format_bytes(bytes):
    """Format a number of bytes with two decimals and a binary suffix, or return 'N/A'

    'N/A' is returned whenever format_decimal_suffix gives an empty result.
    """
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
1683
1684
def lookup_unit_table(unit_table, s, strict=False):
    """Parse "<number><unit>" from s and multiply the number by the unit's value

    @param unit_table  dict mapping unit strings to their multipliers
    @param s           the string to parse
    @param strict      if True, the whole string must match and NUMBER_RE is used as is.
                       Otherwise only the start of the string has to match and every
                       "\\." of NUMBER_RE also accepts a comma
    @returns           the product rounded to an integer, or None if s does not match
    """
    if strict:
        num_re, matcher = NUMBER_RE, re.fullmatch
    else:
        num_re, matcher = NUMBER_RE.replace(R'\.', '[,.]'), re.match
    units_re = '|'.join(map(re.escape, unit_table))
    mobj = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if mobj is None:
        return None

    # A comma is treated as decimal separator
    number = float(mobj.group('num').replace(',', '.'))
    return round(number * unit_table[mobj.group('unit')])
1696
1697
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    # The units are '' and the single letters K, M, G, ... as successive powers of 1024
    binary_units = {}
    for power, unit in enumerate(('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')):
        binary_units[unit] = 1024 ** power
    return lookup_unit_table(binary_units, s.upper(), strict=True)
1703
1704
def parse_filesize(s):
    """Parse a file size with unit (e.g. "5 MiB") via lookup_unit_table

    For every prefix letter X: "XiB", "xB" and the "...bibytes" name are powers
    of 1024, while "XB", "Xb", "xb" and the decimal "...bytes" name are powers of 1000.
    Returns None if s is None.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    prefixes = (
        ('K', 'kilo', 'kibi'), ('M', 'mega', 'mebi'), ('G', 'giga', 'gibi'), ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'), ('E', 'exa', 'exbi'), ('Z', 'zetta', 'zebi'), ('Y', 'yotta', 'yobi'))
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        unit_table.update({
            f'{letter}iB': binary,
            f'{letter}B': decimal,
            f'{letter.lower()}B': binary,
            f'{letter}b': decimal,
            f'{letter.lower()}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
1774
1775
def parse_count(s):
    """Parse a count, optionally with a k/m/kk/b multiplier suffix, into an integer

    Returns None if s is None or if no number can be extracted.
    """
    if s is None:
        return None

    # Drop leading non-digit text that is followed by a whitespace
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    # Nothing but digits and separators
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000, 'K': 1000,
        'm': 1000 ** 2, 'M': 1000 ** 2,
        'kk': 1000 ** 2, 'KK': 1000 ** 2,
        'b': 1000 ** 3, 'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, s)
    if scaled is not None:
        return scaled

    # Fall back to a leading number that is followed by a whitespace or the end of the string
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(mobj.group(1)) if mobj else None
1803
1804
def parse_resolution(s, *, lenient=False):
    """Extract the resolution mentioned in a string

    The following forms are tried in order:
      1. "<width>x<height>" (separator x, X, × or ",") => {'width': ..., 'height': ...}
      2. "<height>p" or "<height>i"                     => {'height': ...}
      3. "4k" or "8k"                                   => {'height': 2160} or {'height': 4320}

    @param s        the string to search; None gives {}
    @param lenient  if True, form 1 may be directly adjacent to letters or digits
    @returns        dict with the keys found; {} if nothing matches
    """
    if s is None:
        return {}

    dimensions_re = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    if not lenient:
        dimensions_re = rf'(?<![a-zA-Z0-9]){dimensions_re}(?![a-zA-Z0-9])'

    dimensions = re.search(dimensions_re, s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        return {'height': int(kilo.group(1)) * 540}

    return {}
1828
1829
def parse_bitrate(s):
    """Return the integer directly preceding "kbps" in s

    Returns None if s is not a str or contains no such number.
    """
    if isinstance(s, str):
        mobj = re.search(r'\b(\d+)\s*kbps', s)
        if mobj is not None:
            return int(mobj.group(1))
    return None
1836
1837
def month_by_name(name, lang='en'):
    """ Return the number of a month by its (locale-independent) name in the given language

    The number is the 1-based position of name in MONTH_NAMES[lang]; languages
    missing from MONTH_NAMES fall back to English.
    Returns None if the name is not in the list.
    """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name in names:
        return names.index(name) + 1
    return None
1847
1848
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, i.e. the first three letters of the English name.
        Returns None for an unknown abbreviation """

    for number, month_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month_name[:3] == abbrev:
            return number
    return None
1857
1858
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    An '&' is left alone when it starts one of the predefined entities
    (amp, lt, gt, apos, quot) or a numeric character reference.
    """
    return re.sub(
        # The highest code point U+10FFFF (1114111) needs 6 hex or 7 decimal digits
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,6};|#[0-9]{,7};)',
        '&amp;',
        xml_str)
1865
1866
def setproctitle(title):
    """Set the name of the current process through prctl(PR_SET_NAME) of 'libc.so.6'

    This is best-effort: nothing happens if ctypes cannot be imported,
    the library cannot be loaded, or it has no prctl.

    @param title  the new process name; must be a str
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # A buffer initialized from bytes is one item larger than the bytes,
    # so the string passed to prctl is NUL-terminated
    buf = ctypes.create_string_buffer(title.encode())
    try:
        # PR_SET_NAME = 15      Ref: /usr/include/linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1893
1894
def remove_start(s, start):
    """Return s without the prefix `start`

    s is returned unchanged if it is None or does not start with `start`.
    """
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1897
1898
def remove_end(s, end):
    """Return s without the suffix `end`

    s is returned unchanged if it is None, does not end with `end`,
    or `end` is empty.
    """
    if s is None or not s.endswith(end):
        return s
    # For an empty `end`, s[:-0] is s[:0] and would drop the whole string
    return s[:-len(end)] if end else s
1901
1902
def remove_quotes(s):
    """Strip one pair of identical quotes (single or double) that encloses s

    s is returned unchanged if it is None, shorter than two items, or not quoted.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1910
1911
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"

    Returns the network location of the URL without a leading "www.",
    or None if the result is empty
    """
    netloc = urllib.parse.urlparse(url).netloc
    domain = remove_start(netloc, 'www.')
    return domain or None
1918
1919
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and trailing slashes"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1923
1924
def base_url(url):
    """Return the http(s) URL up to and including the last "/" before any "?" or "#"

    Raises AttributeError if the URL does not have this form.
    """
    mobj = re.match(r'https?://[^?#]+/', url)
    return mobj.group()
1927
1928
def urljoin(base, path):
    """Join path onto base, returning None for unusable input

    @param base  str or bytes; must start with "http://", "https://" or "//"
                 unless path is already absolute
    @param path  str or bytes; returned as is if it starts with "//" or "<scheme>://"
    @returns     the joined URL, or None if path is empty or not a string,
                 or if base is unusable
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not (isinstance(path, str) and path):
        return None
    # The path is already an absolute (or protocol-relative) URL
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if isinstance(base, str) and re.match(r'^(?:https?:)?//', base):
        return urllib.parse.urljoin(base, path)
    return None
1942
1943
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, multiply by invscale and floor-divide by scale

    @param get_attr  if given (and v is not None), the attribute of this name
                     is converted instead of v itself; a missing attribute counts as None
    @returns         the resulting number, or default if ValueError, TypeError
                     or OverflowError is raised on the way
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        scaled_up = int(v) * invscale
        return scaled_up // scale
    except (ValueError, TypeError, OverflowError):
        return default
1951
1952
def str_or_none(v, default=None):
    """Return str(v), or default if v is None"""
    if v is None:
        return default
    return str(v)
1955
1956
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    An int is returned as is. For a str, every ",", "." and "+" is removed
    before it is converted with int_or_none. Anything else gives None
    """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    return int_or_none(re.sub(r'[,\.\+]', '', int_str))
1964
1965
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, multiply by invscale and divide by scale

    @returns  the resulting number, or default if v is None or cannot be converted
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    # OverflowError: e.g. an int that is too large to be represented as float
    except (ValueError, TypeError, OverflowError):
        return default
1973
1974
def bool_or_none(v, default=None):
    """Return v if it is a bool, else default"""
    if isinstance(v, bool):
        return v
    return default
1977
1978
def strip_or_none(v, default=None):
    """Return v with surrounding whitespace stripped if it is a str, else default"""
    if not isinstance(v, str):
        return default
    return v.strip()
1981
1982
def url_or_none(url):
    """Return the stripped url if it looks like a URL, else None

    Accepted are protocol-relative URLs ("//...") and URLs with one of the
    schemes http(s), rtmp/rtmfp/rtsp (and their variants), mms and ftp(s).
    """
    if not (url and isinstance(url, str)):
        return None
    stripped = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', stripped):
        return stripped
    return None
1988
1989
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a UNIX timestamp or a YYYYMMDD string with strftime

    @param timestamp    int/float UNIX timestamp (formatted in UTC), or a str in YYYYMMDD form
    @param date_format  strftime format. "%s" is replaced by the integer UNIX timestamp
    @param default      returned when the input cannot be converted or formatted
    @returns            the formatted string, or default
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                               + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # For any other type datetime_object stays None; the AttributeError below gives default
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    # OverflowError: the timestamp is outside of the range supported by timedelta/datetime
    except (ValueError, TypeError, AttributeError, OverflowError):
        return default
2007
2008
def parse_duration(s):
    """Parse a duration string into a number of seconds

    Three notations are tried in order:
      1. colon separated:   [[[days:]hours:]mins:]secs[.ms]
      2. with unit letters or names, optionally ISO 8601 like: "1h 2m 3s", "PT1H2M3.5S"
      3. a single, possibly fractional, number of hours or minutes: "2.5 hours"

    @returns  the duration in seconds (float), or None if s is not a str,
              is blank, or matches none of the notations
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    patterns = (
        r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''',
        # Years, months and weeks are accepted here, but they are not captured
        r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''',
        r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$',
    )
    for pattern in patterns:
        m = re.match(pattern, s)
        if m:
            break
    else:
        return None

    # Groups that a notation does not define count as missing
    found = m.groupdict()
    days, hours, mins, secs, ms = (
        found.get(name) for name in ('days', 'hours', 'mins', 'secs', 'ms'))

    # The fraction may be separated by ":" in the first notation
    if ms:
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2063
2064
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the filename's extension: "name.real" => "name.ext.real"

    If expected_real_ext is given and is not the filename's actual extension,
    ext is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{stem}.{ext}{real_ext}'
2071
2072
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with ext: "name.real" => "name.ext"

    If expected_real_ext is given and is not the filename's actual extension,
    ext is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    keep_whole_name = expected_real_ext and real_ext[1:] != expected_real_ext
    return f'{filename if keep_whole_name else stem}.{ext}'
2078
2079
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)
    Returns False if running the binary raises OSError """
    cmd = [exe] + args
    try:
        Popen.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
2088
2089
def _get_exe_version_output(exe, args):
    """Run the executable with the given arguments and return its output

    stderr is redirected into stdout.

    @returns  the output; None if the exit code is non-zero;
              False if running the executable raises OSError
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        output, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else output
2102
2103
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract the version from the output of an executable

    @param output        the output to search; must be a str
    @param version_re    regex whose first group captures the version.
                         By default, the word following "version" is captured
    @param unrecognized  returned if the regex does not match
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    mobj = re.search(pattern, output)
    return mobj.group(1) if mobj else unrecognized
2113
2114
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    unrecognized holds one or two values: the first one is returned if no
    version is found in the output, the last one if the executable exits
    with a non-zero code """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    out = _get_exe_version_output(exe, args)
    if out is None:
        return unrecognized[-1]
    if not out:
        # Falsy output is passed through as is
        return out
    return detect_exe_version(out, version_re, unrecognized[0])
2125
2126
def frange(start=0, stop=None, step=1):
    """Float range

    Like range(), but start, stop and step may be floats.
    Nothing is yielded when step is zero.
    """
    if stop is None:
        start, stop = 0, start
    if step > 0:
        sign = 1
    elif step:
        sign = -1
    else:
        sign = 0
    # Multiplying by the sign makes the same comparison work in both directions
    while sign * start < sign * stop:
        yield start
        start += step
2135
2136
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the iterable only when needed and are kept in a cache.
    Anything that needs the end of the list (negative indices, open-ended
    slices, len(), reversed order) consumes the whole iterable."""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = _cache if _cache is not None else []
        self._reversed = reverse

    def __iter__(self):
        if not self._reversed:
            # Replay what is cached, then continue pulling from the iterable
            yield from self._cache
            for item in self._iterable:
                self._cache.append(item)
                yield item
            return
        # We need to consume the entire iterable to iterate in reverse
        yield from self.exhaust()

    def _exhaust(self):
        """Move all remaining items into the cache and return the cache"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        cache = self._exhaust()
        return cache[::-1] if self._reversed else cache[::1]

    @staticmethod
    def _reverse_index(x):
        """Map an index of the reversed list to the corresponding index of the cache"""
        if x is None:
            return None
        return ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        needs_everything = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if needs_everything:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            # Pull just enough items to cover the highest requested index
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Checking a single item avoids consuming more of the iterable than necessary
        probe = -1 if self._reversed else 0
        try:
            self[probe]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        return len(self._exhaust())

    def __reversed__(self):
        # The new object shares the iterable and the cache with this one
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        # The new object shares the iterable and the cache with this one
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2224
2225
class PagedList:
    """Base class of lists whose items are fetched page by page with pagefunc(pagenum)

    Subclasses must implement _getslice."""

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return the results of the given page as a list, fetching it unless it is cached

        Pages with a number greater than self._pagecount are empty and are not fetched."""
        cached = self._cache.get(pagenum)
        if cached is not None:
            page_results = cached
        elif pagenum > self._pagecount:
            page_results = []
        else:
            page_results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the items with index from start up to, but excluding, end as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not (isinstance(idx, int) and idx >= 0):
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]

    def __bool__(self):
        # Fetching a single item is enough to know whether there is anything
        return bool(self.getslice(0, 1))
2267
2268
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        """Yield the items with index from start up to, but excluding, end (None: no limit)

        Pages are requested one after another with getpage(), beginning with
        the page that contains start.
        """
        for pagenum in itertools.count(start // self._pagesize):
            # Index of the first item of this page and of the next page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset of the first wanted item within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset just past the last wanted item within this page;
            # None if the slice does not end on this page
            # NOTE(review): for end == firstid this evaluates to the page size, which looks
            # reachable only when end <= start - confirm that callers never pass that
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the previous page as the last one (getpage() returns an
                # empty list for pages beyond _pagecount), then let the error propagate
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2308
2309
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        # Pages are always cached
        super().__init__(pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        """Yield the items with index from start up to, but excluding, end (None: no limit)"""
        # Page that contains start, and the number of items to skip on that page
        first_page, skip_elems = divmod(start, self._pagesize)
        if end is None:
            end_page, only_more = self._pagecount, None
        else:
            end_page = min(self._pagecount, end // self._pagesize + 1)
            # Number of items that are still to be yielded
            only_more = end - start
        for pagenum in range(first_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) >= only_more:
                    # The slice ends on this page
                    yield from page_results[:only_more]
                    break
                only_more -= len(page_results)
            yield from page_results
2334
2335
class PlaylistEntries:
    """Access the entries of a playlist info dict by 1-based index or slice

    Indexing yields (1-based index, entry) tuples. See __getitem__.
    """
    # Placeholder for the positions of an incomplete playlist that have no entry
    MissingEntry = object()
    # Whether all entries are known. Set for list entries, and by __getitem__ on reaching the end
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        object providing `params`, `report_warning`, `_match_entry`
                          and (through its type) `_handle_extraction_exceptions`
        @param info_dict  dict with the key "entries" and optionally "requested_entries"

        Raises EntryNotInPlaylist if info_dict has no entries
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # "requested_entries" holds the 1-based playlist position of each of the entries.
            # Rebuild the list with every entry at its position; the gaps are MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            # Any other iterable is wrapped so that it can be indexed
            self._entries = LazyList(entries)

    # A single index, or a range "[start](:|-)[end][:step]" where end may be "inf"/"infinite"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Parse a comma separated item specification

        Yields an int for every single index, and a slice for every range.
        Raises ValueError for an empty or invalid segment, or a zero step.
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # The end is converted with float_or_none, which allows it to be "inf"
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (1-based index, entry) for the items selected by the params of ydl

        The selection is taken from "playlist_items" if given, and otherwise
        built from "playliststart" and "playlistend".
        """
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    # Stop yielding any further items
                    return

    def get_full_count(self):
        """Return the total number of entries if it is known, else None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page, the number of pages is the number of entries
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function that returns the entry at a 0-based index

        The function raises self.IndexError if the index is past the end, and
        EntryNotInPlaylist if an incomplete playlist has no entry at the index.
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                # This is the builtin IndexError; class-level names are not visible in here
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # The lookup is wrapped with ydl's _handle_extraction_exceptions
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Yield (1-based index, entry) for an index or a slice

        Unlike with lists, indices are 1-based and the stop of a slice is inclusive.
        Negative indices count from the end, which requires len(self).
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move the stop one further in the direction of the step to make it inclusive
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forward, there is nothing after the end. Going backward, keep
                # going until the indices are back within the list
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Iterates over the whole playlist
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2470
2471
def uppercase_escape(s):
    """Replace every backslash + "U" + 8 hex digits sequence in s with the character it encodes"""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _unescape, s)
2478
2479
def lowercase_escape(s):
    """Replace every backslash + "u" + 4 hex digits sequence in s with the character it encodes"""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _unescape, s)
2486
2487
def parse_qs(url, **kwargs):
    """Parse the query string of a URL with urllib.parse.parse_qs, which also receives kwargs"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2490
2491
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file and close it

    @param batch_fd  iterable of lines (str or bytes) that has a close() method
    @returns         list of the URLs. Blank lines and lines starting with
                     "#", ";" or "]" are skipped
    """
    def fixup(url):
        """Return the URL of a line, or False if the line does not hold any"""
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # The UTF-8 BOM, either decoded bytewise or as a single character
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: Passing maxsplit positionally is deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2509
2510
def urlencode_postdata(*args, **kargs):
    """Encode the arguments with urllib.parse.urlencode and return the result as ASCII bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2513
2514
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  dict that is merged into the existing query;
                            cannot be combined with the kwarg "query"
       @returns             str
    """
    if isinstance(url, str):
        # Nothing to change
        if not (kwargs or query_update):
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        existing = urllib.parse.parse_qs(url.query)
        # Values may be lists, hence doseq=True
        kwargs['query'] = urllib.parse.urlencode({**existing, **query_update}, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2533
2534
def update_url_query(url, query):
    """Return the URL with `query` merged into its query; shortcut for update_url(url, query_update=query)"""
    return update_url(url=url, query_update=query)
2537
2538
def _multipart_encode_impl(data, boundary):
    """Encode a dict as multipart/form-data with the given boundary

    @param data      dict whose keys and values are str or bytes-like
    @param boundary  the boundary as an ASCII-only str
    @returns         (body as bytes, value for the Content-Type header)

    Raises ValueError if the boundary occurs within one of the encoded fields
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes

    chunks = []
    for k, v in data.items():
        if isinstance(k, str):
            k = k.encode()
        if isinstance(v, str):
            v = v.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks += [delimiter + b'\r\n', content]

    # The closing delimiter
    chunks.append(delimiter + b'--\r\n')

    return b''.join(chunks), content_type
2559
2560
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary; a clash
        between it and the data raises ValueError. Otherwise random
        boundaries are generated until one does not occur in the data.

    Returns the tuple (encoded body, Content-Type header value).

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-chosen boundary is never replaced, so any clash propagates
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary clashed with the data; draw another one
            continue
2589
2590
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether x is an instance of allowed_types but not of blocked_types

    blocked_types defaults to (str, bytes, collections.abc.Mapping)
    """
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
2595
2596
def variadic(x, allowed_types=NO_DEFAULT):
    """Return x as-is if it is iterable-like, otherwise wrap it in a one-element tuple

    allowed_types is handed to is_iterable_like() as its blocked_types,
    i.e. instances of these types get wrapped as well
    """
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
2602
2603
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each of funcs with args/kwargs and return the first acceptable result

    A call that raises one of the lookup/arithmetic errors listed below is skipped,
    as is a result that is not an instance of expected_type (when given).
    Returns None if no function yields an acceptable result.
    """
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
2613
2614
def try_get(src, getter, expected_type=None):
    """Apply getter (one callable or several) to src via try_call() and return the first acceptable result"""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
2617
2618
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of dct for which cndn(key, value) is truthy

    By default, items whose value is None are dropped
    """
    return dict(item for item in dct.items() if cndn(*item))
2621
2622
def merge_dicts(*dicts):
    """Merge dicts, with earlier dicts taking precedence

    None values are never copied. An already merged value is only replaced
    if it is the empty string and the new value is a string
    """
    merged = {}
    for source in dicts:
        for key, value in source.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
2631
2632
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string unchanged if it is a str, otherwise decode it with str(string, encoding, errors)"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
2635
2636
# US rating labels mapped to the age limit that parse_age_limit() returns for them.
# parse_age_limit() upper-cases its input before looking it up here
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# "TV-..." rating labels mapped to the age limit that parse_age_limit() returns for them.
# parse_age_limit() builds its regex from the part of each key after the "TV-" prefix
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2654
2655
def parse_age_limit(s):
    """Parse an age limit from an int or from a rating string

    @param s  int in the range 0..21, or a str such as "18", "16+", a key of
              US_RATINGS or a TV_PARENTAL_GUIDELINES label (case-insensitive)
    @returns  The age limit as int, or None if s cannot be interpreted
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]

    # The label may be written with "-", "_" or nothing after "TV"
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, s)
    if tv_rating is None:
        return None
    return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
2672
2673
def strip_jsonp(code):
    """Unwrap a JSONP call such as `callback({...});` and return just the argument text

    Input that does not look like a single call is returned unchanged
    """
    jsonp_call = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_call.sub(r'\g<callback_data>', code)
2682
2683
def js_to_json(code, vars={}, *, strict=False):
    """Rewrite JavaScript literal code into JSON text, token by token, using regexes

    @param code    str with the JavaScript code
    @param vars    dict of var, val pairs to substitute. val is inserted as-is if it
                   parses as JSON (or if strict is set), otherwise it is JSON-encoded
    @param strict  If true, the lenient rewrites of "new ...(...)", parseInt(...) and
                   immediately invoked functions are skipped, and a token that cannot
                   be converted raises ValueError instead of being quoted
    @returns       str with the converted text
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # One alternative per quote character: the quote, then escaped characters or
    # anything other than a backslash/that quote, then the closing quote
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    # A /* ... */ comment, or a // comment up to and including its newline
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (regex, base) pairs for hexadecimal and leading-zero octal integers,
    # optionally followed by ":" (i.e. used as an object key)
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # group(1) is an unescaped double quote, group(2) the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Passthrough characters keep (or gain) their backslash, "\x" becomes "\u00",
        # a backslash-newline is dropped and any other escape is reduced to the bare character
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Convert the expression inside "${...}" recursively; if that gives a JSON
        # string, splice in its decoded contents rather than the quoted form
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Replacement callback for every token matched by the final regex below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, "!" runs and trailing commas are removed
            return ''

        if v[0] in STRING_QUOTES:
            # Only backtick strings get their "${...}" placeholders expanded
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # An integer used as an object key has to become a quoted string
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            # Anything unrecognized is turned into a string
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # "new Map([[k, v], ...])" becomes a JSON object; a missing argument gives {}
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    # Rewrite constructor/call syntax before tokenizing
    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # "new Date(<string>)" is reduced to the string, any other "new X(...)" is
        # kept as a JSON string of its source text, parseInt(...) is reduced to its
        # digits and an immediately invoked function to its quoted argument
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    # Tokens, in order: strings, comments, trailing commas before "]" or "}", "void 0",
    # identifiers, hex/octal integers, integers used as keys, and runs of "!"
    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2763
2764
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 if it is not listed
    """
    def q(quality_id):
        try:
            position = quality_ids.index(quality_id)
        except ValueError:
            return -1
        return position
    return q
2773
2774
# NOTE(review): presumably the stages at which a postprocessor can be run - confirm against the users of this tuple
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Output templates per template type; both end in "[%(id)s].%(ext)s"
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# NOTE(review): keys look like the kinds of output file and values like the filename
# suffix used for that kind (None where there is none) - confirm against callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# The template has two placeholders to be filled in before use:
# {0} is the pattern for the mapping key and {1} the pattern for the conversion type.
# "prefix" captures any escaped "%%" pairs directly before the format specifier
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of printf-style formatting, as one string
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2814
2815
def limit_length(s, length):
    """ Add ellipses to overly long strings

    @param s       str to shorten, or None
    @param length  Number of characters above which s is shortened
    @returns       None if s is None, s itself if it is not longer than length,
                   otherwise the start of s followed by "..."
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Clamp at 0: a negative slice end would cut from the right instead
        # and give a result that is longer than the input
        return s[:max(length - len(ELLIPSES), 0)] + ELLIPSES
    return s
2824
2825
def version_tuple(v):
    """Split a version string on "." and "-" and return its parts as a tuple of ints"""
    return tuple(map(int, re.split(r'[-.]', v)))
2828
2829
def is_outdated_version(version, limit, assume_new=True):
    """Whether version is lower than limit

    If version is empty or either value cannot be parsed by version_tuple(),
    the answer is `not assume_new`
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            pass
    return not assume_new
2837
2838
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at module level
    from ..update import is_non_updateable

    if is_non_updateable():
        return False
    return True
2845
2846
def args_to_str(args):
    """Get a short string representation for a subprocess command: the quoted args joined by spaces"""
    return ' '.join(map(compat_shlex_quote, args))
2850
2851
def error_to_str(err):
    """Format an exception as "<type name>: <message>" """
    type_name = type(err).__name__
    return '{}: {}'.format(type_name, err)
2854
2855
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type to a file extension

    @param mt       MIME type, optionally followed by ";parameters"
    @param default  Returned if mt is not a str or has no known mapping
    @returns        The mapped extension. Without a default, None is returned when
                    mt is not a str, and the subtype (with "+" replaced by ".")
                    when there is no mapping
    """
    if not isinstance(mt, str):
        return None if default is NO_DEFAULT else default

    # Keys are either full MIME types or bare subtypes
    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'video/ogg': 'ogv',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop any ";parameters" and normalize the case
    mimetype = mt.partition(';')[0].strip().lower()
    subtype = mimetype.rpartition('/')[2]

    # Candidate keys: the full type, the subtype, and the part of the subtype after its last "+"
    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    if default is NO_DEFAULT:
        return subtype.replace('+', '.')
    return default
2945
2946
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension or for a URL/filename

    Returns None for empty input or if no type can be guessed
    """
    if not ext_or_url:
        return None
    # A bare extension is turned into a filename so that mimetypes can handle it
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _ = mimetypes.guess_type(filename)
    return mime_type
2953
2954
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video/audio/subtitle codecs

    @param codecs_str  Codecs string, see http://tools.ietf.org/html/rfc6381
    @returns  {} for empty input. Otherwise a dict with "vcodec", "acodec" ("none" when
              absent), "dynamic_range" and, if present, "scodec". If nothing was
              recognized but there are exactly two entries, they are returned as
              "vcodec" and "acodec" in that order
    """
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    stripped = codecs_str.strip().strip(',')
    split_codecs = [codec for codec in map(str.strip, stripped.split(',')) if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros directly preceding a digit are dropped before the codec is split into parts
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            # Only the first video codec counts
            if vcodec:
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if vcodec or acodec or scodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if scodec is not None:
            result['scodec'] = scodec
        return result
    if len(split_codecs) == 2:
        first, second = split_codecs
        return {
            'vcodec': first,
            'acodec': second,
        }
    return {}
2995
2996
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Pick a container extension for the given video/audio codecs and extensions

    @param vcodecs, vexts  Codecs and extensions of the video streams (same length)
    @param acodecs, aexts  Codecs and extensions of the audio streams (same length)
    @param preferences     Extensions to choose from, in order of preference.
                           mkv is allowed when this is empty or contains "mkv"
    @returns               The chosen extension
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codecs):
        # First codec, reduced to the part before the first ".", without zeros, in lower case
        return try_get(codecs, lambda x: x[0].split('.')[0].replace('0', '').lower())

    vcodec = sanitize_codec(vcodecs)
    acodec = sanitize_codec(acodecs)

    # First choice: an extension whose codec set contains both codecs
    for ext in preferences or COMPATIBLE_CODECS.keys():
        supported = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or supported.issuperset((vcodec, acodec)):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    # Second choice: an extension from the same family as all the stream extensions
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext}:
            return ext
        if any(family.issuperset(current_exts) for family in COMPATIBLE_EXTS):
            return ext

    if allow_mkv:
        return 'mkv'
    return preferences[-1]
3036
3037
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Determine a file extension from the headers of a response

    Tried in order: the filename of a Content-Disposition attachment, the
    x-amz-meta-name header, and finally the Content-Type (via mimetype2ext,
    which receives default)
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    amz_meta_name = headers.get('x-amz-meta-name')
    if amz_meta_name:
        # Text after the last "."; the whole value if there is no "."
        ext = amz_meta_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'), default=default)
3056
3057
def encode_data_uri(data, mime_type):
    """Build a base64 "data:" URI of the given MIME type from the bytes in data"""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type!s};base64,{payload!s}'
3060
3061
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3070
3071
# List of known byte-order-marks (BOM) as (mark, encoding name) pairs.
# The UTF-32-LE mark starts with the bytes of the UTF-16-LE mark, so it is listed
# before it for the benefit of code that tests the marks in order with startswith()
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3080
3081
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) if the decoded text starts with optional
    whitespace followed by "<", else None """

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip the mark (even if repeated) and decode using its encoding
        while first_bytes.startswith(bom):
            encoding = bom_encoding
            first_bytes = first_bytes[len(bom):]

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3091
3092
def determine_protocol(info_dict):
    """Return the protocol of a format

    An explicit "protocol" entry wins. Otherwise it is derived from the
    sanitized "url": its rtmp/mms/rtsp prefix, its m3u8/f4m extension,
    or else its URL scheme
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        if info_dict.get('is_live'):
            return 'm3u8'
        return 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3113
3114
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned

    @param header_row  List with the column headers
    @param data        List of rows
    @param delim       If set, a line made of this string is put below the header
    @param extra_gap   Additional spaces between columns
    @param hide_empty  Drop columns in which every cell of data is empty
    @returns           The table as a single str
    """
    def visible_width(text):
        # Terminal sequences and tabs take up no room
        return len(remove_terminal_sequences(text).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_width(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, keep_flags):
        # Columns beyond the end of keep_flags are kept
        return [cell for keep, cell in itertools.zip_longest(keep_flags, row, fillvalue=True) if keep]

    # A column of data whose width is 0 has a falsy flag and is dropped
    keep_flags = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, keep_flags)
    data = [keep_columns(row, keep_flags) for row in data]

    max_lens = column_widths([header_row] + data)
    extra_gap += 1
    if delim:
        separator = [delim * (ml + extra_gap) for ml in max_lens]
        separator[-1] = separator[-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
        table = [header_row, separator] + data
    else:
        table = [header_row] + data

    def pad(text, col_width):
        slack = col_width - visible_width(text)
        if '\t' in text:
            # The padding goes where the tab is, pushing the rest to the right
            return text.replace('\t', ' ' * slack) + ' ' * extra_gap
        return text + ' ' * (slack + extra_gap)

    lines = []
    for row in table:
        cells = [pad(str(cell), max_lens[pos]) for pos, cell in enumerate(row)]
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3145
3146
def _match_one(filter_part, dct, incomplete):
    """Evaluate a single filter condition against dct

    @param filter_part  Either a comparison "KEY [!]OP[?] VALUE" (VALUE may be quoted)
                        or a unary check "KEY" / "!KEY"
    @param dct          dict holding the values to test
    @param incomplete   bool, or a container of keys that may be missing from dct.
                        A condition on a missing key that is incomplete passes
    @returns            Truthy if the condition passes; this is the operator's own
                        result and so not necessarily a bool
    @raises ValueError  if filter_part cannot be parsed, or if a string-only operator
                        is applied to a numeric field with a numeric comparison value
    """
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    if isinstance(incomplete, bool):
        is_incomplete = lambda _: incomplete
    else:
        is_incomplete = lambda k: k in incomplete

    operator_rex = re.compile(r'''(?x)
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        # Exactly one of the two value groups has matched, and neither can be empty.
        # (The regex has no other value group to fall back on)
        comparison_value = m['quotedstrval'] or m['strval']
        if m['quote']:
            # Unescape quotes inside a quoted value
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, (int, float)):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                # Not a plain integer: try a filesize (with and without
                # a trailing "B") and then a duration
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            # A missing value passes if the key is incomplete or the operator was followed by "?"
            return is_incomplete(m['key']) or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    # For bool values the check is on the value itself, for anything else on its presence
    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if is_incomplete(m.group('key')) and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
3225
3226
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @param filter_str  Conditions separated by "&"; all of them must pass.
                       A literal "&" inside a condition is written with a backslash before it
    @param dct         The dictionary to test
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    @returns           Whether the filter passes
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        condition = filter_part.replace(r'\&', '&')
        if not _match_one(condition, dct, incomplete):
            return False
    return True
3237
3238
def match_filter_func(filters, breaking_filters=None):
    """Build a match filter function out of filter strings

    @param filters           One filter string or several; an entry passes if any of them
                             matches. The special filter "-" makes the result interactive
    @param breaking_filters  Filters whose failure raises RejectedVideoReached
    @returns  None if neither argument holds any filter. Otherwise a function
              (info_dict, incomplete=False) that returns None if the entry passes
              (NO_DEFAULT if interactive and not incomplete), or else a message
              saying why it is skipped
    """
    if not (filters or breaking_filters):
        return None
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        rejection = breaking_filters(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if filters and not any(match_str(f, info_dict, incomplete) for f in filters):
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'

        if interactive and not incomplete:
            return NO_DEFAULT
        return None
    return _match_func
3261
3262
class download_range_func:
    """Callable that yields the sections of a video to download

    @param chapters   Regexes; every chapter whose title matches one of them is yielded
    @param ranges     (start, end) pairs in seconds; negative values count from
                      the end of the video when its duration is known
    @param from_info  Whether to also yield the start_time/end_time of the info dict
    """

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters = chapters
        self.ranges = ranges
        self.from_info = from_info

    def __call__(self, info_dict, ydl):
        """Yield one dict per section; an empty dict when no restriction applies"""
        if info_dict.get('chapters'):
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'

        for regex in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(regex, chapter['title']):
                    continue
                warning = None
                yield {**chapter, 'index': index}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            start_time = self._handle_negative_timestamp(start, info_dict)
            end_time = self._handle_negative_timestamp(end, info_dict)
            yield {
                'start_time': start_time,
                'end_time': end_time,
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # A negative time is relative to the end, but never before the start
        if info.get('duration') and time < 0:
            return max(info['duration'] + time, 0)
        return time

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        class_name = type(self).__name__
        return f'{__name__}.{class_name}({self.chapters}, {self.ranges})'
3303
3304
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds

    Accepts a number with an optional "s" suffix, or "H:MM:SS" with an optional
    fraction introduced by "." or ":". Returns None for empty or unrecognized input
    """
    if not time_expr:
        return None

    offset = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
3316
3317
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode, "HH:MM:SS,mmm" """
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
3320
3321
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode, "H:MM:SS.cc" """
    timetuple = timetuple_from_msec(seconds * 1000)
    # The last field is given in hundredths of a second instead of milliseconds
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
3325
3326
3327 def dfxp2srt(dfxp_data):
3328     '''
3329     @param dfxp_data A bytes-like object containing DFXP data
3330     @returns A unicode object containing converted SRT data
3331     '''
3332     LEGACY_NAMESPACES = (
3333         (b'http://www.w3.org/ns/ttml', [
3334             b'http://www.w3.org/2004/11/ttaf1',
3335             b'http://www.w3.org/2006/04/ttaf1',
3336             b'http://www.w3.org/2006/10/ttaf1',
3337         ]),
3338         (b'http://www.w3.org/ns/ttml#styling', [
3339             b'http://www.w3.org/ns/ttml#style',
3340         ]),
3341     )
3342
3343     SUPPORTED_STYLING = [
3344         'color',
3345         'fontFamily',
3346         'fontSize',
3347         'fontStyle',
3348         'fontWeight',
3349         'textDecoration'
3350     ]
3351
3352     _x = functools.partial(xpath_with_ns, ns_map={
3353         'xml': 'http://www.w3.org/XML/1998/namespace',
3354         'ttml': 'http://www.w3.org/ns/ttml',
3355         'tts': 'http://www.w3.org/ns/ttml#styling',
3356     })
3357
3358     styles = {}
3359     default_style = {}
3360
3361     class TTMLPElementParser:
3362         _out = ''
3363         _unclosed_elements = []
3364         _applied_styles = []
3365
3366         def start(self, tag, attrib):
3367             if tag in (_x('ttml:br'), 'br'):
3368                 self._out += '\n'
3369             else:
3370                 unclosed_elements = []
3371                 style = {}
3372                 element_style_id = attrib.get('style')
3373                 if default_style:
3374                     style.update(default_style)
3375                 if element_style_id:
3376                     style.update(styles.get(element_style_id, {}))
3377                 for prop in SUPPORTED_STYLING:
3378                     prop_val = attrib.get(_x('tts:' + prop))
3379                     if prop_val:
3380                         style[prop] = prop_val
3381                 if style:
3382                     font = ''
3383                     for k, v in sorted(style.items()):
3384                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3385                             continue
3386                         if k == 'color':
3387                             font += ' color="%s"' % v
3388                         elif k == 'fontSize':
3389                             font += ' size="%s"' % v
3390                         elif k == 'fontFamily':
3391                             font += ' face="%s"' % v
3392                         elif k == 'fontWeight' and v == 'bold':
3393                             self._out += '<b>'
3394                             unclosed_elements.append('b')
3395                         elif k == 'fontStyle' and v == 'italic':
3396                             self._out += '<i>'
3397                             unclosed_elements.append('i')
3398                         elif k == 'textDecoration' and v == 'underline':
3399                             self._out += '<u>'
3400                             unclosed_elements.append('u')
3401                     if font:
3402                         self._out += '<font' + font + '>'
3403                         unclosed_elements.append('font')
3404                     applied_style = {}
3405                     if self._applied_styles:
3406                         applied_style.update(self._applied_styles[-1])
3407                     applied_style.update(style)
3408                     self._applied_styles.append(applied_style)
3409                 self._unclosed_elements.append(unclosed_elements)
3410
3411         def end(self, tag):
3412             if tag not in (_x('ttml:br'), 'br'):
3413                 unclosed_elements = self._unclosed_elements.pop()
3414                 for element in reversed(unclosed_elements):
3415                     self._out += '</%s>' % element
3416                 if unclosed_elements and self._applied_styles:
3417                     self._applied_styles.pop()
3418
3419         def data(self, data):
3420             self._out += data
3421
3422         def close(self):
3423             return self._out.strip()
3424
3425     # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
3426     # This will not trigger false positives since only UTF-8 text is being replaced
3427     dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
3428
3429     def parse_node(node):
3430         target = TTMLPElementParser()
3431         parser = xml.etree.ElementTree.XMLParser(target=target)
3432         parser.feed(xml.etree.ElementTree.tostring(node))
3433         return parser.close()
3434
3435     for k, v in LEGACY_NAMESPACES:
3436         for ns in v:
3437             dfxp_data = dfxp_data.replace(ns, k)
3438
3439     dfxp = compat_etree_fromstring(dfxp_data)
3440     out = []
3441     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3442
3443     if not paras:
3444         raise ValueError('Invalid dfxp/TTML subtitle')
3445
3446     repeat = False
3447     while True:
3448         for style in dfxp.findall(_x('.//ttml:style')):
3449             style_id = style.get('id') or style.get(_x('xml:id'))
3450             if not style_id:
3451                 continue
3452             parent_style_id = style.get('style')
3453             if parent_style_id:
3454                 if parent_style_id not in styles:
3455                     repeat = True
3456                     continue
3457                 styles[style_id] = styles[parent_style_id].copy()
3458             for prop in SUPPORTED_STYLING:
3459                 prop_val = style.get(_x('tts:' + prop))
3460                 if prop_val:
3461                     styles.setdefault(style_id, {})[prop] = prop_val
3462         if repeat:
3463             repeat = False
3464         else:
3465             break
3466
3467     for p in ('body', 'div'):
3468         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3469         if ele is None:
3470             continue
3471         style = styles.get(ele.get('style'))
3472         if not style:
3473             continue
3474         default_style.update(style)
3475
3476     for para, index in zip(paras, itertools.count(1)):
3477         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3478         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3479         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3480         if begin_time is None:
3481             continue
3482         if not end_time:
3483             if not dur:
3484                 continue
3485             end_time = begin_time + dur
3486         out.append('%d\n%s --> %s\n%s\n\n' % (
3487             index,
3488             srt_subtitles_timecode(begin_time),
3489             srt_subtitles_timecode(end_time),
3490             parse_node(para)))
3491
3492     return ''.join(out)
3493
3494
def cli_option(params, command_option, param, separator=None):
    """Build the command-line fragment for a single option that takes a value

    @param params           mapping the option value is looked up in
    @param command_option   the switch to emit (e.g. '--proxy')
    @param param            key of the value inside params
    @param separator        if given, switch and value are joined with it into
                            one argument; otherwise they are separate arguments
    @returns                a list of arguments; empty if the value is missing or None
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3500
3501
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command-line fragment for a boolean option

    The value stored under param must be True, False or None. True/False are
    translated to true_value/false_value and formatted by cli_option();
    None (or a missing key) produces no arguments at all.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    # Re-use cli_option by handing it a lookup table keyed on the flag itself
    rendered_values = {True: true_value, False: false_value}
    return cli_option(rendered_values, command_option, flag, separator)
3506
3507
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare switch when params[param] equals expected_value

    @returns    [command_option] on a match, otherwise an empty list
                (a missing key is compared as None)
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
3510
3511
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick the extra command-line arguments that apply to the given keys

    @param argdict      dict mapping lower-case keys to argument lists.
                        A plain list/tuple is also accepted (old format): it is
                        returned untouched when use_compat is true and treated
                        as "nothing configured" otherwise. None means nothing
                        is configured.
    @param keys         list/tuple of entries tried in order; each entry is
                        passed through variadic(), so it may name several keys
                        at once (presumably a single key or a tuple of keys)
    @param default      returned as-is when nothing matches
    @param use_compat   whether a list/tuple argdict is honoured
    @returns            for the first entry with at least one configured key,
                        the concatenation of the argument lists of its keys
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_group in keys:
        configured = [
            args for args in (argdict.get(key.lower()) for key in variadic(key_group))
            if args is not None]
        if configured:
            return list(itertools.chain.from_iterable(configured))
    return default
3530
3531
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve the configured arguments for exe when run on behalf of main_key

    The lookup keys handed to cli_configuration_args() are built from a root
    key, which is exe itself when it equals main_key (case-insensitively) and
    'main_key+exe' otherwise, with each suffix from keys appended to it.
    Only when the bare root key is among them are the (main_key, exe) group
    and the 'default' key added as fallbacks; otherwise the list/tuple
    compatibility mode is switched off.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    if main_key == exe:
        root_key = exe
    else:
        root_key = f'{main_key}+{exe}'

    suffixes = keys or ['']
    keys = [f'{root_key}{suffix}' for suffix in suffixes]
    if root_key not in keys:
        use_compat = False
    else:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
3543
3544
class ISO639Utils:
    """Conversions between two-letter and three-letter language codes"""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked at, so a tag with a
        suffix such as 'en-US' resolves like 'en'. Unknown codes give None.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        When several two-letter codes share the same three-letter code, the
        one listed first in the table is returned. Unknown codes give None.
        """
        matches = (
            two_letter for two_letter, three_letter in cls._lang_map.items()
            if three_letter == code)
        return next(matches, None)
3749
3750
class ISO3166Utils:
    """Lookup of country names by two-letter country code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code to the corresponding full name

        The lookup is case-insensitive; codes missing from the table give None.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4012
4013
class GeoUtils:
    """Helpers for producing IPv4 addresses that belong to a given country"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Pick a random IPv4 address from a country's block or an explicit block

        @param code_or_block    a two-character country code (case-insensitive,
                                looked up in _country_ip_map) or a block in
                                'address/prefix-length' notation
        @returns                the address in dotted-quad form, or None for a
                                country code that has no block in the table
        """
        cidr = code_or_block
        if len(code_or_block) == 2:
            cidr = cls._country_ip_map.get(code_or_block.upper())
            if not cidr:
                return None

        base_ip, prefix_len = cidr.split('/')
        # The range starts at the given address itself and ends where all the
        # bits outside of the prefix are set
        (lowest,) = struct.unpack('!L', socket.inet_aton(base_ip))
        host_mask = 0xffffffff >> int(prefix_len)
        highest = lowest | host_mask
        picked = random.randint(lowest, highest)
        return str(socket.inet_ntoa(struct.pack('!L', picked)))
4272
4273
4274 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4275 # released into Public Domain
4276 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4277
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    The result is the big-endian representation of n without leading zero
    bytes. Zero (and, as in the original PyCrypto routine, any non-positive
    value) is encoded as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # int.to_bytes with the minimal byte count yields exactly the
        # "no leading zeros" form, without repeated bytes concatenation
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    if blocksize > 0:
        # -len % blocksize is the number of bytes missing to the next multiple
        s = (-len(s) % blocksize) * b'\000' + s
    return s
4306
4307
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    The bytes are interpreted as an unsigned big-endian number; an empty
    string gives 0.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Equivalent to zero-padding to a multiple of 4 and accumulating 32-bit
    # words, but done in a single call
    return int.from_bytes(s, 'big')
4323
4324
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    The bytes of data are reversed before being read as one hexadecimal number.

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return format(ciphertext, 'x')
4340
4341
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result is laid out as `00 02 <padding> 00 <data>`. Every padding
    byte is non-zero: the first zero byte after the `00 02` prefix marks
    where the data starts, so a zero inside the padding would cut the
    block short when it is unpadded.

    @param {int[]} data        input data (any iterable of ints)
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data is longer than length - 11
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2, *padding, 0, *data]
4355
4356
4357 def _base_n_table(n, table):
4358     if not table and not n:
4359         raise ValueError('Either table or n must be specified')
4360     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4361
4362     if n and n != len(table):
4363         raise ValueError(f'base {n} exceeds table length {len(table)}')
4364     return table
4365
4366
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string

    @param num      non-negative integer to convert
    @param n        base; defaults to the length of table
    @param table    digits to use, as accepted by _base_n_table
    @raises ValueError  if num is negative, or if num is non-zero and the
                        table has fewer than 2 digits
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    if num < 0:
        # Floor division never takes a negative number to 0 (-1 // base == -1),
        # so the conversion loop would never finish
        raise ValueError('Cannot encode a negative number')
    if base < 2:
        # Dividing by 1 leaves num unchanged, so the loop would never finish either
        raise ValueError('At least 2 digits are needed to encode a non-zero number')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    # Digits were produced least significant first
    return ''.join(reversed(digits))
4378
4379
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    digit_values = {}
    for value, digit in enumerate(_base_n_table(n, table)):
        digit_values[digit] = value
    base = len(digit_values)

    number = 0
    for digit in string:
        number = number * base + digit_values[digit]
    return number
4387
4388
def decode_packed_codes(code):
    """
    Expand the packed code that PACKED_CODES_RE finds in `code`

    The match provides the obfuscated code, a base, a symbol count and a
    `|`-separated symbol list. Every word of the obfuscated code is looked up
    by its base-n index; an empty symbol stands for the index string itself.
    """
    packed = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = packed.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in reversed(range(count)):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    def substitute(word):
        return symbol_table[word.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, obfuscated_code)
4405
4406
def caesar(s, alphabet, shift):
    """
    Shift every character of s that occurs in alphabet by `shift` positions

    The shift wraps around the end of alphabet.
    Characters outside of alphabet are kept as they are.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4414
4415
def rot47(s):
    """Apply caesar() to s with a shift of 47 over the 94 characters from `!` to `~`"""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4418
4419
def parse_m3u8_attributes(attrib):
    """
    Parse a comma-separated `KEY=value` attribute list into a dict

    Values may be quoted with double quotes, in which case they may contain commas.
    """
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    info = {}
    for match in re.finditer(attribute_re, attrib):
        key, val = match.group('key', 'val')
        # Quoted values are stored without their surrounding quotes
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
4427
4428
def urshift(val, n):
    """Shift val right by n bits, adding 2**32 to a negative val first"""
    if val < 0:
        val += 0x100000000
    return val >> n
4431
4432
def write_xattr(path, key, value):
    """
    Set the extended attribute `key` of the file at `path` to the bytes `value`

    The methods are tried in this order: an alternate data stream on Windows,
    a Python level setxattr function, and lastly the "setfattr" or "xattr" executables.

    @raises XAttrMetadataError      if writing the attribute failed
    @raises XAttrUnavailableError   if there is neither a usable module nor executable
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as stream:
                stream.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        setxattr = xattr.set if version_tuple(xattr.__version__) >= (0, 5, 0) else None
    elif xattr:
        setxattr = xattr.setxattr
    else:
        setxattr = None

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    if check_executable('setfattr', ['--version']):
        exe = 'setfattr'
    elif check_executable('xattr', ['-h']):
        exe = 'xattr'
    else:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The executables take the value as a text argument
    text_value = value.decode()
    if exe == 'xattr':
        cmd = [exe, '-w', key, text_value, path]
    else:
        cmd = [exe, '-n', key, '-v', text_value, path]
    try:
        _, stderr, returncode = Popen.run(
            cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4484
4485
def random_birthday(year_field, month_field, day_field):
    """
    Pick a random date from 1950-01-01 to 1995-12-31 (both inclusive)

    @returns    dict mapping the given field names to the year, month and day as strings
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(days=random.randint(0, span_days))
    return dict(zip(
        (year_field, month_field, day_field),
        map(str, (birthday.year, birthday.month, birthday.day))))
4496
4497
def find_available_port(interface=''):
    """
    Bind a temporary socket to port 0 of interface and report the port it got

    @returns    the port number, or None if an OSError occurred
    """
    with contextlib.suppress(OSError):
        with socket.socket() as sock:
            sock.bind((interface, 0))
            return sock.getsockname()[1]
    return None
4505
4506
# Templates for internet shortcut files, which are plain text files.
# Each template is a %-format string that is filled in from a mapping.

# INI-style file with an [InternetShortcut] section; needs the key `url`
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# XML property list (see the DOCTYPE) with a single URL entry; needs the key `url`
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# [Desktop Entry] file of type Link; needs the keys `filename` and `url`
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps the kind of shortcut to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4538
4539
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    IRIs without a host (e.g. relative references such as `/path`) are accepted; no host is added to them.
    An explicit port 80 is dropped from the result.

    @raises ValueError  if the network location contains an IPv6 address
    """

    iri_parts = urllib.parse.urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no host, e.g. for a relative reference
    if iri_parts.hostname:
        # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
        net_location += iri_parts.hostname.encode('idna').decode()
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4582
4583
def to_high_limit_path(path):
    """Prefix the absolute form of path with `\\\\?\\` on win32 and cygwin; return path as is elsewhere"""
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    absolute_path = os.path.abspath(path)
    return '\\\\?\\' + absolute_path
4590
4591
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """
    Look up field in obj and format the result with template

    @param field        path(s) passed on to traversal.traverse_obj
    @param template     %-format string applied to the value
    @param ignore       value(s) for which `default` is returned instead.
                        If not given, every falsy value is ignored
    @param default      what to return for an ignored value
    @param func         applied to the value before formatting
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        ignored = not val
    else:
        ignored = val in variadic(ignore)
    return default if ignored else template % func(val)
4597
4598
def clean_podcast_url(url):
    """
    Remove the known tracking/analytics prefixes from a podcast URL

    If removing them leaves two schemes in a row (`https://https://...`), the first one is dropped.
    """
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/'''
    without_trackers = re.sub(tracking_prefix_re, '', url)
    return re.sub(r'^\w+://(\w+://)', r'\1', without_trackers)
4619
4620
4621 _HEX_TABLE = '0123456789abcdef'
4622
4623
4624 def random_uuidv4():
4625     return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4626
4627
def make_dir(path, to_screen=None):
    """
    Create the directory that `path` lies in, along with any missing parents

    @param path         path of a file whose directory is needed
    @param to_screen    optional callable; given the error message if creation fails
    @returns            True if the directory exists now (or path has no directory part),
                        False if it could not be created
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Reporting is optional, so only call to_screen if one was actually given
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4638
4639
def get_executable_path():
    """Return the absolute directory of the path reported by _get_variant_and_executable_path"""
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
4644
4645
def get_user_config_dirs(package_name):
    """Yield the per-user configuration directories of package_name, in order of lookup"""
    # .config (e.g. ~/.config/package_name)
    config_home = os.getenv('XDG_CONFIG_HOME')
    if not config_home:
        config_home = compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name)
    appdata_dir = os.getenv('appdata')
    if appdata_dir:
        yield os.path.join(appdata_dir, package_name)

    # home (~/.package_name)
    home_dir = compat_expanduser('~')
    yield os.path.join(home_dir, f'.{package_name}')
4658
4659
def get_system_config_dirs(package_name):
    """Yield the system-wide configuration directories of package_name"""
    # /etc/package_name
    yield from [os.path.join('/etc', package_name)]
4663
4664
def time_seconds(**kwargs):
    """
    Returns the time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the offset that kwargs describe.

    kwargs are passed to datetime.timedelta, e.g. time_seconds(hours=9)
    """
    offset = datetime.timedelta(**kwargs)
    return time.time() + offset.total_seconds()
4670
4671
4672 # create a JSON Web Signature (jws) with HS256 algorithm
4673 # the resulting format is in JWS Compact Serialization
4674 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
4675 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """
    Sign payload_data with HMAC-SHA256 and return the token as bytes

    @param payload_data     JSON serializable payload
    @param key              signing key (str)
    @param headers          header fields added to (or replacing) `alg` and `typ`
    @returns                `header.payload.signature`, each part base64 encoded
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    header_data.update(headers or {})

    def encode_json(data):
        return base64.b64encode(json.dumps(data).encode())

    signing_input = encode_json(header_data) + b'.' + encode_json(payload_data)
    digest = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(digest)
4689
4690
4691 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """
    Return the decoded payload of a `header.payload.signature` token

    Neither the header nor the signature is looked at.
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded_payload = payload_b64 + '==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
4697
4698
# State of the virtual terminal mode of the Windows console:
# False on Windows until windows_enable_vt_mode() has succeeded (which sets it to True),
# None on all other platforms, where supports_terminal_sequences() does not consult it
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4700
4701
@functools.cache
def supports_terminal_sequences(stream):
    """
    Whether terminal sequences may be written to stream

    This needs WINDOWS_VT_MODE to be set on Windows and the TERM environment
    variable to be set elsewhere. Besides that, the stream has to be a tty.
    The result is cached per stream.
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
4713
4714
def windows_enable_vt_mode():
    """
    Turn on ENABLE_VIRTUAL_TERMINAL_PROCESSING for the console output

    Does nothing if get_windows_version() is below (10, 0, 10586).
    On success, WINDOWS_VT_MODE is set to True and the cache of
    supports_terminal_sequences is cleared.

    Raises an Exception if the console mode could not be read or changed.

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported here rather than at the top of the file
    # NOTE(review): presumably because msvcrt and ctypes.wintypes are Windows specific - confirm
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device to get a handle for the mode calls below
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        # Read the current mode so that only the one flag gets added to it
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        # The file descriptor is only needed for the calls above
        os.close(handle)

    # Only reached if both calls succeeded
    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Results cached while the mode was still off are no longer valid
    supports_terminal_sequences.cache_clear()
4745
4746
4747 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
4748
4749
4750 def remove_terminal_sequences(string):
4751     return _terminal_sequences_re.sub('', string)
4752
4753
def number_of_digits(number):
    """Return the length of number when formatted with %d (a minus sign counts too)"""
    formatted = '%d' % number
    return len(formatted)
4756
4757
def join_nonempty(*values, delim='-', from_dict=None):
    """
    Join the str() of all truthy values with delim

    @param from_dict    if given, values are paths that are looked up in it
                        with traversal.traverse_obj first
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
4762
4763
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    The thumbnails are returned unchanged if no format has a width.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    # Compared as (width, height), so the width decides and the height breaks ties
    sizes = [tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats]
    max_width, max_height = max(sizes, default=(0, 0))
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': scaled_url},
            {'width': max_width, 'height': max_height},
            thumbnail))
    return scaled
4784
4785
def parse_http_range(range):
    """
    Parse value of "Range" or "Content-Range" HTTP header into tuple.

    @returns    (start, end, total); all None if the value is empty or cannot be parsed
    """
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not crg:
        return None, None, None
    start, end, total = crg.groups()
    return int(start), int_or_none(end), int_or_none(total)
4794
4795
def read_stdin(what):
    """
    Return sys.stdin

    If `what` is given, a notice that it is read from STDIN is written out first.
    """
    if not what:
        return sys.stdin
    eof = 'Ctrl+D' if compat_os_name != 'nt' else 'Ctrl+Z'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
4801
4802
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)

    The encoding is None if there is neither a BOM nor a `# coding: ...` line at the very start.
    """

    # BOM marks are given priority over declarations
    bom_result = next(((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_result:
        return bom_result

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if declaration is None:
        return None, 0
    return declaration.group(1).decode(), 0
4819
4820
class Config:
    """
    A list of command-line style arguments, plus the configs it refers to

    Every location in the parsed `config_locations` option is loaded as a
    child Config into `configs`. A child shares the `_loaded_paths` set of
    its parent, so that each location is loaded only once.
    """
    # Arguments passed to init(); stays None until then
    own_args = None
    # Set to own_args by load_configs(); this is what all_args yields
    parsed_args = None
    # File that own_args were read from, if any
    filename = None
    # Set by load_configs(); init() asserts that this is still False
    __initialized = False

    def __init__(self, parser, label=None):
        """
        @param parser   object providing parse_known_args, parse_args and error
        @param label    name of the config, used by __str__
        """
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Set the arguments and their source file, then load the configs they refer to

        @returns    the result of load_configs()
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """
        Parse own_args and load every config location listed in them

        @returns    False if `filename` has been loaded before, else True
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            # Locations given in own_args are joined onto the directory of this file
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            # "-" stands for STDIN, which is read at most once too
            if location == '-':
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            # For a directory, the file "yt-dlp.conf" inside of it is used
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        """
        Describe the config: the label and filename, followed by own_args with login info hidden.
        The child configs follow on separate lines, each line prefixed with "| "
        """
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """
        Read the file and split its contents into arguments

        The contents are split with shlex, with comments allowed.

        @param default  returned if the file cannot be opened
        @raises ValueError  if the contents cannot be decoded or split
        """
        # NOTE(review): `default` is one shared list and is returned as is; it must not be mutated by callers
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            # The encoding is detected from the first 512 bytes; `skip` is where reading resumes
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of opts in which the values of PRIVATE_OPTS are replaced with "PRIVATE" """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        # Matches the "--option=value" form
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # In the "--option value" form, the value is the argument following the option
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """
        Create a child config from args (as taken by init) and add it to `configs`

        The child is dropped if its init() returns False
        """
        config = type(self)(self.parser, label)
        # Share the set, so that the child also knows what has been loaded already
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Yield the arguments of all child configs (latest child first), then own parsed_args"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        """Pass all_args to the parser's parse_known_args"""
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        """Pass all_args to the parser's parse_args"""
        return self.parser.parse_args(self.all_args)
4928
4929
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        # dict.items is called unbound, so only actual dicts are accepted
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
4933
4934
def cached_method(f):
    """
    Cache a method

    The results are stored on the instance, separately for each method name
    and each combination of arguments (with the defaults filled in).
    """
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        call = signature.bind(self, *args, **kwargs)
        call.apply_defaults()
        # The first argument is `self`, which is not part of the key
        _, *key_parts = call.arguments.values()
        key = tuple(key_parts)

        method_caches = vars(self).setdefault('_cached_method__cache', {})
        cache = method_caches.setdefault(f.__name__, {})
        if key in cache:
            return cache[key]
        cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper
4950
4951
class classproperty:
    """
    property access for class methods with optional caching

    Usable both as `@classproperty` and as `@classproperty(cache=True)`.
    With caching, the function is called only once for each class.
    """
    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Used with arguments only: wait for the function to decorate
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        results = self._cache
        if results is None:
            return self.func(cls)
        if cls not in results:
            results[cls] = self.func(cls)
        return results[cls]
4970
4971
class function_with_repr:
    """
    Wrapper that calls func and has a customizable repr()

    Without `repr_`, the repr is the module and qualified name of func
    """

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
4984
4985
class Namespace(types.SimpleNamespace):
    """SimpleNamespace that iterates over the values of its attributes"""

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        """The (name, value) pairs of the attributes"""
        return vars(self).items()
4995
4996
# File extensions (without the leading dot), grouped by the kind of media
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# `video` and `audio` are made to include the respective common extensions as well
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions (after the additions above)
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5011
5012
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue

    The loop runs again only if `retry.error` was set during the previous
    attempt, and while the number of attempts made does not exceed `retries`.
    After each failed attempt, the error callback is called with
    (error, attempt, retries) and the keyword arguments given to the constructor.
    """
    # attempt: number of attempts started so far
    # _error: error of the current attempt. NO_DEFAULT means that none was set;
    #         the initial None allows the first attempt to start
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        """
        @param _retries         number of retries allowed; a falsy value means 0
        @param _error_callback  called after each attempt that set an error
        @param kwargs           additional keyword arguments for the callback
        """
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        """Whether another attempt should be started"""
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        """The error set for the current attempt, or None if there is none"""
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Clear the error; the loop body has to set it again to get another attempt
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            # Execution resumes here once the loop body is done with this attempt
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        @param e            the error of the failed attempt
        @param count        number of attempts made so far
        @param retries      number of retries allowed
        @param sleep_func   delay before the next attempt: either the value itself,
                            or a callable that is called with n=count - 1
        @param info         callable for informational messages
        @param warn         callable for warnings
        @param error        callable for the final error message, once the retries
                            are used up. If it is not given, `e` is raised instead
        @param suffix       text added after "Retrying" in the warning
        """
        # Out of retries: report through `error` if available, else raise
        if count > retries:
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # Use the cause (or the original message) without its trailing "."
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5067
5068
def make_archive_id(ie, video_id):
    """
    Build the archive id "<lowercase key> <video_id>"

    @param ie   either the key as a string, or an object with an ie_key() method
    """
    if not isinstance(ie, str):
        ie = ie.ie_key()
    return f'{ie.lower()} {video_id}'
5072
5073
def truncate_string(s, left, right=0):
    """
    Shorten s to left + right characters by replacing its middle with "..."

    The "..." takes up the last 3 of the `left` characters.
    s is returned as is if it is None or short enough already.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5079
5080
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """
    Build the ordered list of items that the options select

    @param options      names of items or aliases. A leading "-" removes
                        the item(s) from what was selected so far
    @param alias_dict   maps each alias to a list of options. The alias "all"
                        is required; its list holds every valid item
    @param use_regex    match options that are not aliases against the valid
                        items as case-insensitive regular expressions
    @param start        items selected initially
    @raises ValueError  for an option that is neither an alias nor a valid item
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for option in options:
        negated = option.startswith('-')
        name = option[1:] if negated else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if negated:
                # Removing an alias flips the sign of each of its options
                expansion = [
                    entry[1:] if entry.startswith('-') else f'-{entry}'
                    for entry in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            matches = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            matches = [name]
        else:
            raise ValueError(name)

        if not negated:
            requested.extend(matches)
            continue
        for item in matches:
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5109
5110
5111 # TODO: Rewrite
class FormatSorter:
    """Turn a list of format-sort specifications into per-format sort keys.

    The class-level tables below describe every known sort field; the methods
    of the class combine them with the user's and the extractor's requested
    order to produce one comparable tuple per format dict.
    """

    # Grammar of one sort item: optional '+' (reverse), a field name, and an
    # optional limit introduced by ':' or by '~' (the latter requests "closest to").
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    # Order that is appended after the user's and the extractor's items,
    # so every one of these fields always takes part in the comparison
    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # NOTE(review): not referenced inside this class; presumably the
    # youtube-dl compatible order selected by a caller - confirm
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration, keyed by sort-field name. Recognised keys
    # (missing ones get defaults from _get_field_setting):
    #   type         'field' (default), 'ordered', 'boolean', 'extractor',
    #                'multiple', 'combined' or 'alias'
    #   field        key(s) read from the format dict; for 'alias' the target sort field
    #   convert      how a limit given in the sort string is converted
    #   order        preference list for 'ordered' fields, best first; '' is the
    #                slot used for values that match nothing else
    #   order_free   replaces 'order' when the 'prefer_free_formats' param is set
    #   regex        entries of 'order' are patterns applied with re.match
    #   forced       field is always placed at the very front of the order
    #   priority     placed at the front unless 'format_sort_force' is set
    #   visible      False hides the field from the verbose "Formats sorted by" line
    #   max          for 'extractor' fields: values at or above it count as -1
    #   not_in_list  for 'boolean' fields: values that make the test fail
    #   function     for 'multiple' fields: reduces the values of all source fields to one
    #   default      value used when the format has no usable number for the field
    #   deprecated   using this alias emits a deprecation notice
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        # Fields derived from several others: 'combined' expands into its
        # member sort fields, 'multiple' reduces several format keys to one value
        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }
5199
5200     def __init__(self, ydl, field_preference):
5201         self.ydl = ydl
5202         self._order = []
5203         self.evaluate_params(self.ydl.params, field_preference)
5204         if ydl.params.get('verbose'):
5205             self.print_verbose_info(self.ydl.write_debug)
5206
5207     def _get_field_setting(self, field, key):
5208         if field not in self.settings:
5209             if key in ('forced', 'priority'):
5210                 return False
5211             self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5212                                         'deprecated and may be removed in a future version')
5213             self.settings[field] = {}
5214         propObj = self.settings[field]
5215         if key not in propObj:
5216             type = propObj.get('type')
5217             if key == 'field':
5218                 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5219             elif key == 'convert':
5220                 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5221             else:
5222                 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5223             propObj[key] = default
5224         return propObj[key]
5225
5226     def _resolve_field_value(self, field, value, convertNone=False):
5227         if value is None:
5228             if not convertNone:
5229                 return None
5230         else:
5231             value = value.lower()
5232         conversion = self._get_field_setting(field, 'convert')
5233         if conversion == 'ignore':
5234             return None
5235         if conversion == 'string':
5236             return value
5237         elif conversion == 'float_none':
5238             return float_or_none(value)
5239         elif conversion == 'bytes':
5240             return parse_bytes(value)
5241         elif conversion == 'order':
5242             order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5243             use_regex = self._get_field_setting(field, 'regex')
5244             list_length = len(order_list)
5245             empty_pos = order_list.index('') if '' in order_list else list_length + 1
5246             if use_regex and value is not None:
5247                 for i, regex in enumerate(order_list):
5248                     if regex and re.match(regex, value):
5249                         return list_length - i
5250                 return list_length - empty_pos  # not in list
5251             else:  # not regex or  value = None
5252                 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5253         else:
5254             if value.isnumeric():
5255                 return float(value)
5256             else:
5257                 self.settings[field]['convert'] = 'string'
5258                 return value
5259
5260     def evaluate_params(self, params, sort_extractor):
5261         self._use_free_order = params.get('prefer_free_formats', False)
5262         self._sort_user = params.get('format_sort', [])
5263         self._sort_extractor = sort_extractor
5264
5265         def add_item(field, reverse, closest, limit_text):
5266             field = field.lower()
5267             if field in self._order:
5268                 return
5269             self._order.append(field)
5270             limit = self._resolve_field_value(field, limit_text)
5271             data = {
5272                 'reverse': reverse,
5273                 'closest': False if limit is None else closest,
5274                 'limit_text': limit_text,
5275                 'limit': limit}
5276             if field in self.settings:
5277                 self.settings[field].update(data)
5278             else:
5279                 self.settings[field] = data
5280
5281         sort_list = (
5282             tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5283             + (tuple() if params.get('format_sort_force', False)
5284                 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5285             + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5286
5287         for item in sort_list:
5288             match = re.match(self.regex, item)
5289             if match is None:
5290                 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5291             field = match.group('field')
5292             if field is None:
5293                 continue
5294             if self._get_field_setting(field, 'type') == 'alias':
5295                 alias, field = field, self._get_field_setting(field, 'field')
5296                 if self._get_field_setting(alias, 'deprecated'):
5297                     self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5298                                                 f'be removed in a future version. Please use {field} instead')
5299             reverse = match.group('reverse') is not None
5300             closest = match.group('separator') == '~'
5301             limit_text = match.group('limit')
5302
5303             has_limit = limit_text is not None
5304             has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5305             has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5306
5307             fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5308             limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5309             limit_count = len(limits)
5310             for (i, f) in enumerate(fields):
5311                 add_item(f, reverse, closest,
5312                          limits[i] if i < limit_count
5313                          else limits[0] if has_limit and not has_multiple_limits
5314                          else None)
5315
5316     def print_verbose_info(self, write_debug):
5317         if self._sort_user:
5318             write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5319         if self._sort_extractor:
5320             write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5321         write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5322             '+' if self._get_field_setting(field, 'reverse') else '', field,
5323             '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5324                           self._get_field_setting(field, 'limit_text'),
5325                           self._get_field_setting(field, 'limit'))
5326             if self._get_field_setting(field, 'limit_text') is not None else '')
5327             for field in self._order if self._get_field_setting(field, 'visible')]))
5328
5329     def _calculate_field_preference_from_value(self, format, field, type, value):
5330         reverse = self._get_field_setting(field, 'reverse')
5331         closest = self._get_field_setting(field, 'closest')
5332         limit = self._get_field_setting(field, 'limit')
5333
5334         if type == 'extractor':
5335             maximum = self._get_field_setting(field, 'max')
5336             if value is None or (maximum is not None and value >= maximum):
5337                 value = -1
5338         elif type == 'boolean':
5339             in_list = self._get_field_setting(field, 'in_list')
5340             not_in_list = self._get_field_setting(field, 'not_in_list')
5341             value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5342         elif type == 'ordered':
5343             value = self._resolve_field_value(field, value, True)
5344
5345         # try to convert to number
5346         val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5347         is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5348         if is_num:
5349             value = val_num
5350
5351         return ((-10, 0) if value is None
5352                 else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
5353                 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5354                 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5355                 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5356                 else (-1, value, 0))
5357
5358     def _calculate_field_preference(self, format, field):
5359         type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
5360         get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5361         if type == 'multiple':
5362             type = 'field'  # Only 'field' is allowed in multiple for now
5363             actual_fields = self._get_field_setting(field, 'field')
5364
5365             value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5366         else:
5367             value = get_value(field)
5368         return self._calculate_field_preference_from_value(format, field, type, value)
5369
5370     def calculate_preference(self, format):
5371         # Determine missing protocol
5372         if not format.get('protocol'):
5373             format['protocol'] = determine_protocol(format)
5374
5375         # Determine missing ext
5376         if not format.get('ext') and 'url' in format:
5377             format['ext'] = determine_ext(format['url'])
5378         if format.get('vcodec') == 'none':
5379             format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5380             format['video_ext'] = 'none'
5381         else:
5382             format['video_ext'] = format['ext']
5383             format['audio_ext'] = 'none'
5384         # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
5385         #    format['preference'] = -1000
5386
5387         if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5388             # HEVC-over-FLV is out-of-spec by FLV's original spec
5389             # ref. https://trac.ffmpeg.org/ticket/6389
5390             # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5391             format['preference'] = -100
5392
5393         # Determine missing bitrates
5394         if format.get('vcodec') == 'none':
5395             format['vbr'] = 0
5396         if format.get('acodec') == 'none':
5397             format['abr'] = 0
5398         if not format.get('vbr') and format.get('vcodec') != 'none':
5399             format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
5400         if not format.get('abr') and format.get('acodec') != 'none':
5401             format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
5402         if not format.get('tbr'):
5403             format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
5404
5405         return tuple(self._calculate_field_preference(format, field) for field in self._order)
5406
5407
5408 # XXX: Temporary
class _YDLLogger:
    """Logger facade that forwards to a YoutubeDL-like object, if one was given.

    Every method is a no-op when the logger was created without (or with a
    falsy) *ydl*.
    """

    def __init__(self, ydl=None):
        self._ydl = ydl

    def debug(self, message):
        """Forward *message* to `ydl.write_debug`."""
        if not self._ydl:
            return
        self._ydl.write_debug(message)

    def info(self, message):
        """Forward *message* to `ydl.to_screen`."""
        if not self._ydl:
            return
        self._ydl.to_screen(message)

    def warning(self, message, *, once=False):
        """Forward *message* and *once* to `ydl.report_warning`."""
        if not self._ydl:
            return
        self._ydl.report_warning(message, once)

    def error(self, message, *, is_error=True):
        """Forward *message* to `ydl.report_error`, passing *is_error* by keyword."""
        if not self._ydl:
            return
        self._ydl.report_error(message, is_error=is_error)

    def stdout(self, message):
        """Forward *message* to `ydl.to_stdout`."""
        if not self._ydl:
            return
        self._ydl.to_stdout(message)

    def stderr(self, message):
        """Forward *message* to `ydl.to_stderr`."""
        if not self._ydl:
            return
        self._ydl.to_stderr(message)