1 import asyncio
2 import atexit
3 import base64
4 import binascii
5 import calendar
6 import codecs
7 import collections
8 import collections.abc
9 import contextlib
10 import datetime
11 import email.header
12 import email.utils
13 import errno
14 import hashlib
15 import hmac
16 import html.entities
17 import html.parser
18 import inspect
19 import io
20 import itertools
21 import json
22 import locale
23 import math
24 import mimetypes
25 import netrc
26 import operator
27 import os
28 import platform
29 import random
30 import re
31 import shlex
32 import socket
33 import ssl
34 import struct
35 import subprocess
36 import sys
37 import tempfile
38 import time
39 import traceback
40 import types
41 import unicodedata
42 import urllib.error
43 import urllib.parse
44 import urllib.request
45 import xml.etree.ElementTree
46
47 from . import traversal
48
49 from ..compat import functools # isort: split
50 from ..compat import (
51 compat_etree_fromstring,
52 compat_expanduser,
53 compat_HTMLParseError,
54 compat_os_name,
55 compat_shlex_quote,
56 )
57 from ..dependencies import websockets, xattr
58
59 __name__ = __name__.rsplit('.', 1)[0] # Pretend to be the parent module
60
61 # This is not clearly defined otherwise
62 compiled_regex_type = type(re.compile(''))
63
64
65 class NO_DEFAULT:
66 pass
67
68
69 def IDENTITY(x):
70 return x
71
72
73 ENGLISH_MONTH_NAMES = [
74 'January', 'February', 'March', 'April', 'May', 'June',
75 'July', 'August', 'September', 'October', 'November', 'December']
76
77 MONTH_NAMES = {
78 'en': ENGLISH_MONTH_NAMES,
79 'fr': [
80 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
81 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
82 # these follow the genitive grammatical case (dopełniacz)
83 # some websites might be using nominative, which will require another month list
84 # https://en.wikibooks.org/wiki/Polish/Noun_cases
85 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
86 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
87 }
88
89 # From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
90 TIMEZONE_NAMES = {
91 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
92 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
93 'EST': -5, 'EDT': -4, # Eastern
94 'CST': -6, 'CDT': -5, # Central
95 'MST': -7, 'MDT': -6, # Mountain
96 'PST': -8, 'PDT': -7 # Pacific
97 }
98
99 # needed for sanitizing filenames in restricted mode
100 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
101 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
102 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
103
104 DATE_FORMATS = (
105 '%d %B %Y',
106 '%d %b %Y',
107 '%B %d %Y',
108 '%B %dst %Y',
109 '%B %dnd %Y',
110 '%B %drd %Y',
111 '%B %dth %Y',
112 '%b %d %Y',
113 '%b %dst %Y',
114 '%b %dnd %Y',
115 '%b %drd %Y',
116 '%b %dth %Y',
117 '%b %dst %Y %I:%M',
118 '%b %dnd %Y %I:%M',
119 '%b %drd %Y %I:%M',
120 '%b %dth %Y %I:%M',
121 '%Y %m %d',
122 '%Y-%m-%d',
123 '%Y.%m.%d.',
124 '%Y/%m/%d',
125 '%Y/%m/%d %H:%M',
126 '%Y/%m/%d %H:%M:%S',
127 '%Y%m%d%H%M',
128 '%Y%m%d%H%M%S',
129 '%Y%m%d',
130 '%Y-%m-%d %H:%M',
131 '%Y-%m-%d %H:%M:%S',
132 '%Y-%m-%d %H:%M:%S.%f',
133 '%Y-%m-%d %H:%M:%S:%f',
134 '%d.%m.%Y %H:%M',
135 '%d.%m.%Y %H.%M',
136 '%Y-%m-%dT%H:%M:%SZ',
137 '%Y-%m-%dT%H:%M:%S.%fZ',
138 '%Y-%m-%dT%H:%M:%S.%f0Z',
139 '%Y-%m-%dT%H:%M:%S',
140 '%Y-%m-%dT%H:%M:%S.%f',
141 '%Y-%m-%dT%H:%M',
142 '%b %d %Y at %H:%M',
143 '%b %d %Y at %H:%M:%S',
144 '%B %d %Y at %H:%M',
145 '%B %d %Y at %H:%M:%S',
146 '%H:%M %d-%b-%Y',
147 )
148
149 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
150 DATE_FORMATS_DAY_FIRST.extend([
151 '%d-%m-%Y',
152 '%d.%m.%Y',
153 '%d.%m.%y',
154 '%d/%m/%Y',
155 '%d/%m/%y',
156 '%d/%m/%Y %H:%M:%S',
157 '%d-%m-%Y %H:%M',
158 '%H:%M %d/%m/%Y',
159 ])
160
161 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
162 DATE_FORMATS_MONTH_FIRST.extend([
163 '%m-%d-%Y',
164 '%m.%d.%Y',
165 '%m/%d/%Y',
166 '%m/%d/%y',
167 '%m/%d/%Y %H:%M:%S',
168 ])
169
170 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
171 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
172
173 NUMBER_RE = r'\d+(?:\.\d+)?'
174
175
176 @functools.cache
177 def preferredencoding():
178 """Get preferred encoding.
179
180 Returns the best encoding scheme for the system, based on
181 locale.getpreferredencoding() and some further tweaks.
182 """
183 try:
184 pref = locale.getpreferredencoding()
185 'TEST'.encode(pref)
186 except Exception:
187 pref = 'UTF-8'
188
189 return pref
190
191
192 def write_json_file(obj, fn):
193 """ Encode obj as JSON and write it to fn, atomically if possible """
194
195 tf = tempfile.NamedTemporaryFile(
196 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
197 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
198
199 try:
200 with tf:
201 json.dump(obj, tf, ensure_ascii=False)
202 if sys.platform == 'win32':
203 # Need to remove existing file on Windows, else os.rename raises
204 # WindowsError or FileExistsError.
205 with contextlib.suppress(OSError):
206 os.unlink(fn)
207 with contextlib.suppress(OSError):
208 mask = os.umask(0)
209 os.umask(mask)
210 os.chmod(tf.name, 0o666 & ~mask)
211 os.rename(tf.name, fn)
212 except Exception:
213 with contextlib.suppress(OSError):
214 os.remove(tf.name)
215 raise
216
217
218 def find_xpath_attr(node, xpath, key, val=None):
219 """ Find the xpath xpath[@key=val] """
220 assert re.match(r'^[a-zA-Z_-]+$', key)
221 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
222 return node.find(expr)
223
224 # Historical note: on Python 2.6 the xml.etree.ElementTree.Element methods did
225 # not support the namespaces parameter, hence the manual prefix expansion below
226
227
228 def xpath_with_ns(path, ns_map):
229 components = [c.split(':') for c in path.split('/')]
230 replaced = []
231 for c in components:
232 if len(c) == 1:
233 replaced.append(c[0])
234 else:
235 ns, tag = c
236 replaced.append('{%s}%s' % (ns_map[ns], tag))
237 return '/'.join(replaced)
238
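# Illustrative usage (a sketch, not part of the original module; the namespace
# URI is a placeholder):
#   >>> xpath_with_ns('media:song/media:name', {'media': 'http://example.com/ns'})
#   '{http://example.com/ns}song/{http://example.com/ns}name'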
239
240 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
241 def _find_xpath(xpath):
242 return node.find(xpath)
243
244 if isinstance(xpath, str):
245 n = _find_xpath(xpath)
246 else:
247 for xp in xpath:
248 n = _find_xpath(xp)
249 if n is not None:
250 break
251
252 if n is None:
253 if default is not NO_DEFAULT:
254 return default
255 elif fatal:
256 name = xpath if name is None else name
257 raise ExtractorError('Could not find XML element %s' % name)
258 else:
259 return None
260 return n
261
262
263 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
264 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
265 if n is None or n == default:
266 return n
267 if n.text is None:
268 if default is not NO_DEFAULT:
269 return default
270 elif fatal:
271 name = xpath if name is None else name
272 raise ExtractorError('Could not find XML element\'s text %s' % name)
273 else:
274 return None
275 return n.text
276
277
278 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
279 n = find_xpath_attr(node, xpath, key)
280 if n is None:
281 if default is not NO_DEFAULT:
282 return default
283 elif fatal:
284 name = f'{xpath}[@{key}]' if name is None else name
285 raise ExtractorError('Could not find XML attribute %s' % name)
286 else:
287 return None
288 return n.attrib[key]
289
290
291 def get_element_by_id(id, html, **kwargs):
292 """Return the content of the tag with the specified ID in the passed HTML document"""
293 return get_element_by_attribute('id', id, html, **kwargs)
294
295
296 def get_element_html_by_id(id, html, **kwargs):
297 """Return the html of the tag with the specified ID in the passed HTML document"""
298 return get_element_html_by_attribute('id', id, html, **kwargs)
299
300
301 def get_element_by_class(class_name, html):
302 """Return the content of the first tag with the specified class in the passed HTML document"""
303 retval = get_elements_by_class(class_name, html)
304 return retval[0] if retval else None
305
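# Illustrative usage (a sketch, not part of the original module):
#   >>> get_element_by_class('title', '<div class="title main">Hello</div>')
#   'Hello'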
306
307 def get_element_html_by_class(class_name, html):
308 """Return the html of the first tag with the specified class in the passed HTML document"""
309 retval = get_elements_html_by_class(class_name, html)
310 return retval[0] if retval else None
311
312
313 def get_element_by_attribute(attribute, value, html, **kwargs):
314 retval = get_elements_by_attribute(attribute, value, html, **kwargs)
315 return retval[0] if retval else None
316
317
318 def get_element_html_by_attribute(attribute, value, html, **kwargs):
319 retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
320 return retval[0] if retval else None
321
322
323 def get_elements_by_class(class_name, html, **kwargs):
324 """Return the content of all tags with the specified class in the passed HTML document as a list"""
325 return get_elements_by_attribute(
326 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
327 html, escape_value=False)
328
329
330 def get_elements_html_by_class(class_name, html):
331 """Return the html of all tags with the specified class in the passed HTML document as a list"""
332 return get_elements_html_by_attribute(
333 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
334 html, escape_value=False)
335
336
337 def get_elements_by_attribute(*args, **kwargs):
338 """Return the content of the tag with the specified attribute in the passed HTML document"""
339 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
340
341
342 def get_elements_html_by_attribute(*args, **kwargs):
343 """Return the html of the tag with the specified attribute in the passed HTML document"""
344 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
345
346
347 def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
348 """
349 Return the text (content) and the html (whole) of the tag with the specified
350 attribute in the passed HTML document
351 """
352 if not value:
353 return
354
355 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
356
357 value = re.escape(value) if escape_value else value
358
359 partial_element_re = rf'''(?x)
360 <(?P<tag>{tag})
361 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
362 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
363 '''
364
365 for m in re.finditer(partial_element_re, html):
366 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
367
368 yield (
369 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
370 whole
371 )
372
373
374 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
375 """
376 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
377 closing tag for the first opening tag it has encountered, and can be used
378 as a context manager
379 """
380
381 class HTMLBreakOnClosingTagException(Exception):
382 pass
383
384 def __init__(self):
385 self.tagstack = collections.deque()
386 html.parser.HTMLParser.__init__(self)
387
388 def __enter__(self):
389 return self
390
391 def __exit__(self, *_):
392 self.close()
393
394 def close(self):
395 # handle_endtag does not return after raising HTMLBreakOnClosingTagException,
396 # so data remains buffered; we are no longer interested in it, so we
397 # override this method to discard it
398 pass
399
400 def handle_starttag(self, tag, _):
401 self.tagstack.append(tag)
402
403 def handle_endtag(self, tag):
404 if not self.tagstack:
405 raise compat_HTMLParseError('no tags in the stack')
406 while self.tagstack:
407 inner_tag = self.tagstack.pop()
408 if inner_tag == tag:
409 break
410 else:
411 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
412 if not self.tagstack:
413 raise self.HTMLBreakOnClosingTagException()
414
415
416 # XXX: This should be far less strict
417 def get_element_text_and_html_by_tag(tag, html):
418 """
419 For the first element with the specified tag in the passed HTML document
420 return its content (text) and the whole element (html)
421 """
422 def find_or_raise(haystack, needle, exc):
423 try:
424 return haystack.index(needle)
425 except ValueError:
426 raise exc
427 closing_tag = f'</{tag}>'
428 whole_start = find_or_raise(
429 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
430 content_start = find_or_raise(
431 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
432 content_start += whole_start + 1
433 with HTMLBreakOnClosingTagParser() as parser:
434 parser.feed(html[whole_start:content_start])
435 if not parser.tagstack or parser.tagstack[0] != tag:
436 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
437 offset = content_start
438 while offset < len(html):
439 next_closing_tag_start = find_or_raise(
440 html[offset:], closing_tag,
441 compat_HTMLParseError(f'closing {tag} tag not found'))
442 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
443 try:
444 parser.feed(html[offset:offset + next_closing_tag_end])
445 offset += next_closing_tag_end
446 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
447 return html[content_start:offset + next_closing_tag_start], \
448 html[whole_start:offset + next_closing_tag_end]
449 raise compat_HTMLParseError('unexpected end of html')
450
451
452 class HTMLAttributeParser(html.parser.HTMLParser):
453 """Trivial HTML parser to gather the attributes for a single element"""
454
455 def __init__(self):
456 self.attrs = {}
457 html.parser.HTMLParser.__init__(self)
458
459 def handle_starttag(self, tag, attrs):
460 self.attrs = dict(attrs)
461 raise compat_HTMLParseError('done')
462
463
464 class HTMLListAttrsParser(html.parser.HTMLParser):
465 """HTML parser to gather the attributes for the elements of a list"""
466
467 def __init__(self):
468 html.parser.HTMLParser.__init__(self)
469 self.items = []
470 self._level = 0
471
472 def handle_starttag(self, tag, attrs):
473 if tag == 'li' and self._level == 0:
474 self.items.append(dict(attrs))
475 self._level += 1
476
477 def handle_endtag(self, tag):
478 self._level -= 1
479
480
481 def extract_attributes(html_element):
482 """Given a string for an HTML element such as
483 <el
484 a="foo" B="bar" c="&98;az" d=boz
485 empty= noval entity="&amp;"
486 sq='"' dq="'"
487 >
488 Decode and return a dictionary of attributes.
489 {
490 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
491 'empty': '', 'noval': None, 'entity': '&',
492 'sq': '"', 'dq': '\''
493 }.
494 """
495 parser = HTMLAttributeParser()
496 with contextlib.suppress(compat_HTMLParseError):
497 parser.feed(html_element)
498 parser.close()
499 return parser.attrs
500
501
502 def parse_list(webpage):
503 """Given a string for an series of HTML <li> elements,
504 return a dictionary of their attributes"""
505 parser = HTMLListAttrsParser()
506 parser.feed(webpage)
507 parser.close()
508 return parser.items
509
510
511 def clean_html(html):
512 """Clean an HTML snippet into a readable string"""
513
514 if html is None: # Convenience for sanitizing descriptions etc.
515 return html
516
517 html = re.sub(r'\s+', ' ', html)
518 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
519 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
520 # Strip html tags
521 html = re.sub('<.*?>', '', html)
522 # Replace html entities
523 html = unescapeHTML(html)
524 return html.strip()
525
526
527 class LenientJSONDecoder(json.JSONDecoder):
528 # TODO: Write tests
529 def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
530 self.transform_source, self.ignore_extra = transform_source, ignore_extra
531 self._close_attempts = 2 * close_objects
532 super().__init__(*args, **kwargs)
533
534 @staticmethod
535 def _close_object(err):
536 doc = err.doc[:err.pos]
537 # We need to add a comma first to get the correct error message
538 if err.msg.startswith('Expecting \',\''):
539 return doc + ','
540 elif not doc.endswith(','):
541 return
542
543 if err.msg.startswith('Expecting property name'):
544 return doc[:-1] + '}'
545 elif err.msg.startswith('Expecting value'):
546 return doc[:-1] + ']'
547
548 def decode(self, s):
549 if self.transform_source:
550 s = self.transform_source(s)
551 for attempt in range(self._close_attempts + 1):
552 try:
553 if self.ignore_extra:
554 return self.raw_decode(s.lstrip())[0]
555 return super().decode(s)
556 except json.JSONDecodeError as e:
557 if e.pos is None:
558 raise
559 elif attempt < self._close_attempts:
560 s = self._close_object(e)
561 if s is not None:
562 continue
563 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
564 assert False, 'Too many attempts to decode JSON'
565
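# Illustrative usage (a sketch, not part of the original module): each
# unterminated object/array needs two repair attempts (a comma, then the
# closer), hence `_close_attempts = 2 * close_objects` above.
#   >>> LenientJSONDecoder(close_objects=2).decode('{"a": [1, 2')
#   {'a': [1, 2]}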
566
567 def sanitize_open(filename, open_mode):
568 """Try to open the given filename, and slightly tweak it if this fails.
569
570 Attempts to open the given filename. If this fails, it tries to change
571 the filename slightly, step by step, until it's either able to open it
572 or it fails and raises a final exception, like the standard open()
573 function.
574
575 It returns the tuple (stream, definitive_file_name).
576 """
577 if filename == '-':
578 if sys.platform == 'win32':
579 import msvcrt
580
581 # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
582 with contextlib.suppress(io.UnsupportedOperation):
583 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
584 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
585
586 for attempt in range(2):
587 try:
588 try:
589 if sys.platform == 'win32':
590 # FIXME: An exclusive lock also locks the file from being read.
591 # Since windows locks are mandatory, don't lock the file on windows (for now).
592 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
593 raise LockingUnsupportedError()
594 stream = locked_file(filename, open_mode, block=False).__enter__()
595 except OSError:
596 stream = open(filename, open_mode)
597 return stream, filename
598 except OSError as err:
599 if attempt or err.errno in (errno.EACCES,):
600 raise
601 old_filename, filename = filename, sanitize_path(filename)
602 if old_filename == filename:
603 raise
604
605
606 def timeconvert(timestr):
607 """Convert RFC 2822 defined time string into system timestamp"""
608 timestamp = None
609 timetuple = email.utils.parsedate_tz(timestr)
610 if timetuple is not None:
611 timestamp = email.utils.mktime_tz(timetuple)
612 return timestamp
613
614
615 def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
616 """Sanitizes a string so it could be used as part of a filename.
617 @param restricted Use a stricter subset of allowed characters
618 @param is_id Whether this is an ID that should be kept unchanged if possible.
619 If unset, yt-dlp's new sanitization rules are in effect
620 """
621 if s == '':
622 return ''
623
624 def replace_insane(char):
625 if restricted and char in ACCENT_CHARS:
626 return ACCENT_CHARS[char]
627 elif not restricted and char == '\n':
628 return '\0 '
629 elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
630 # Replace with their full-width unicode counterparts
631 return {'/': '\u29F8', '\\': '\u29F9'}.get(char, chr(ord(char) + 0xfee0))
632 elif char == '?' or ord(char) < 32 or ord(char) == 127:
633 return ''
634 elif char == '"':
635 return '' if restricted else '\''
636 elif char == ':':
637 return '\0_\0-' if restricted else '\0 \0-'
638 elif char in '\\/|*<>':
639 return '\0_'
640 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
641 return '\0_'
642 return char
643
644 # Replace look-alike Unicode glyphs
645 if restricted and (is_id is NO_DEFAULT or not is_id):
646 s = unicodedata.normalize('NFKC', s)
647 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
648 result = ''.join(map(replace_insane, s))
649 if is_id is NO_DEFAULT:
650 result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars
651 STRIP_RE = r'(?:\0.|[ _-])*'
652 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
653 result = result.replace('\0', '') or '_'
654
655 if not is_id:
656 while '__' in result:
657 result = result.replace('__', '_')
658 result = result.strip('_')
659 # Common case of "Foreign band name - English song title"
660 if restricted and result.startswith('-_'):
661 result = result[2:]
662 if result.startswith('-'):
663 result = '_' + result[len('-'):]
664 result = result.lstrip('.')
665 if not result:
666 result = '_'
667 return result
668
669
670 def sanitize_path(s, force=False):
671 """Sanitizes and normalizes path on Windows"""
672 # XXX: this handles drive relative paths (c:sth) incorrectly
673 if sys.platform == 'win32':
674 force = False
675 drive_or_unc, _ = os.path.splitdrive(s)
676 elif force:
677 drive_or_unc = ''
678 else:
679 return s
680
681 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
682 if drive_or_unc:
683 norm_path.pop(0)
684 sanitized_path = [
685 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
686 for path_part in norm_path]
687 if drive_or_unc:
688 sanitized_path.insert(0, drive_or_unc + os.path.sep)
689 elif force and s and s[0] == os.path.sep:
690 sanitized_path.insert(0, os.path.sep)
691 # TODO: Fix behavioral differences <3.12
692 # The workaround using `normpath` only superficially passes tests
693 # Ref: https://github.com/python/cpython/pull/100351
694 return os.path.normpath(os.path.join(*sanitized_path))
695
696
697 def sanitize_url(url, *, scheme='http'):
698 # Prepend protocol-less URLs with the `http:` scheme to reduce
699 # the number of unwanted failures caused by a missing protocol
700 if url is None:
701 return
702 elif url.startswith('//'):
703 return f'{scheme}:{url}'
704 # Fix some common typos seen so far
705 COMMON_TYPOS = (
706 # https://github.com/ytdl-org/youtube-dl/issues/15649
707 (r'^httpss://', r'https://'),
708 # https://bx1.be/lives/direct-tv/
709 (r'^rmtp([es]?)://', r'rtmp\1://'),
710 )
711 for mistake, fixup in COMMON_TYPOS:
712 if re.match(mistake, url):
713 return re.sub(mistake, fixup, url)
714 return url
715
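# Illustrative usage (a sketch, not part of the original module):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com/video')
#   'https://example.com/video'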
716
717 def extract_basic_auth(url):
718 parts = urllib.parse.urlsplit(url)
719 if parts.username is None:
720 return url, None
721 url = urllib.parse.urlunsplit(parts._replace(netloc=(
722 parts.hostname if parts.port is None
723 else '%s:%d' % (parts.hostname, parts.port))))
724 auth_payload = base64.b64encode(
725 ('%s:%s' % (parts.username, parts.password or '')).encode())
726 return url, f'Basic {auth_payload.decode()}'
727
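# Illustrative usage (a sketch, not part of the original module; the
# credentials are made up):
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')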
728
729 def expand_path(s):
730 """Expand shell variables and ~"""
731 return os.path.expandvars(compat_expanduser(s))
732
733
734 def orderedSet(iterable, *, lazy=False):
735 """Remove all duplicates from the input iterable"""
736 def _iter():
737 seen = [] # Do not use set since the items can be unhashable
738 for x in iterable:
739 if x not in seen:
740 seen.append(x)
741 yield x
742
743 return _iter() if lazy else list(_iter())
744
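# Illustrative usage (a sketch, not part of the original module):
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]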
745
746 def _htmlentity_transform(entity_with_semicolon):
747 """Transforms an HTML entity to a character."""
748 entity = entity_with_semicolon[:-1]
749
750 # Known non-numeric HTML entity
751 if entity in html.entities.name2codepoint:
752 return chr(html.entities.name2codepoint[entity])
753
754 # TODO: HTML5 allows entities without a semicolon.
755 # E.g. '&Eacuteric' should be decoded as 'Éric'.
756 if entity_with_semicolon in html.entities.html5:
757 return html.entities.html5[entity_with_semicolon]
758
759 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
760 if mobj is not None:
761 numstr = mobj.group(1)
762 if numstr.startswith('x'):
763 base = 16
764 numstr = '0%s' % numstr
765 else:
766 base = 10
767 # See https://github.com/ytdl-org/youtube-dl/issues/7518
768 with contextlib.suppress(ValueError):
769 return chr(int(numstr, base))
770
771 # Unknown entity in name, return its literal representation
772 return '&%s;' % entity
773
774
775 def unescapeHTML(s):
776 if s is None:
777 return None
778 assert isinstance(s, str)
779
780 return re.sub(
781 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
782
783
784 def escapeHTML(text):
785 return (
786 text
787 .replace('&', '&amp;')
788 .replace('<', '&lt;')
789 .replace('>', '&gt;')
790 .replace('"', '&quot;')
791 .replace("'", '&#39;')
792 )
793
794
795 class netrc_from_content(netrc.netrc):
796 def __init__(self, content):
797 self.hosts, self.macros = {}, {}
798 with io.StringIO(content) as stream:
799 self._parse('-', stream, False)
800
801
802 class Popen(subprocess.Popen):
803 if sys.platform == 'win32':
804 _startupinfo = subprocess.STARTUPINFO()
805 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
806 else:
807 _startupinfo = None
808
809 @staticmethod
810 def _fix_pyinstaller_ld_path(env):
811 """Restore LD_LIBRARY_PATH when using PyInstaller
812 Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
813 https://github.com/yt-dlp/yt-dlp/issues/4573
814 """
815 if not hasattr(sys, '_MEIPASS'):
816 return
817
818 def _fix(key):
819 orig = env.get(f'{key}_ORIG')
820 if orig is None:
821 env.pop(key, None)
822 else:
823 env[key] = orig
824
825 _fix('LD_LIBRARY_PATH') # Linux
826 _fix('DYLD_LIBRARY_PATH') # macOS
827
828 def __init__(self, *args, env=None, text=False, **kwargs):
829 if env is None:
830 env = os.environ.copy()
831 self._fix_pyinstaller_ld_path(env)
832
833 self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
834 if text is True:
835 kwargs['universal_newlines'] = True # For 3.6 compatibility
836 kwargs.setdefault('encoding', 'utf-8')
837 kwargs.setdefault('errors', 'replace')
838 super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
839
840 def communicate_or_kill(self, *args, **kwargs):
841 try:
842 return self.communicate(*args, **kwargs)
843 except BaseException: # Including KeyboardInterrupt
844 self.kill(timeout=None)
845 raise
846
847 def kill(self, *, timeout=0):
848 super().kill()
849 if timeout != 0:
850 self.wait(timeout=timeout)
851
852 @classmethod
853 def run(cls, *args, timeout=None, **kwargs):
854 with cls(*args, **kwargs) as proc:
855 default = '' if proc.__text_mode else b''
856 stdout, stderr = proc.communicate_or_kill(timeout=timeout)
857 return stdout or default, stderr or default, proc.returncode
858
859
860 def encodeArgument(s):
861 # Legacy code that uses byte strings
862 # Uncomment the following line after fixing all post processors
863 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
864 return s if isinstance(s, str) else s.decode('ascii')
865
866
867 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
868
869
870 def timetuple_from_msec(msec):
871 secs, msec = divmod(msec, 1000)
872 mins, secs = divmod(secs, 60)
873 hrs, mins = divmod(mins, 60)
874 return _timetuple(hrs, mins, secs, msec)
875
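# Illustrative usage (a sketch, not part of the original module):
#   >>> timetuple_from_msec(345067)
#   Time(hours=0, minutes=5, seconds=45, milliseconds=67)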
876
877 def formatSeconds(secs, delim=':', msec=False):
878 time = timetuple_from_msec(secs * 1000)
879 if time.hours:
880 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
881 elif time.minutes:
882 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
883 else:
884 ret = '%d' % time.seconds
885 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
886
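# Illustrative usage (a sketch, not part of the original module):
#   >>> formatSeconds(3725)
#   '1:02:05'
#   >>> formatSeconds(61.5, msec=True)
#   '1:01.500'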
887
888 def bug_reports_message(before=';'):
889 from ..update import REPOSITORY
890
891 msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
892 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
893
894 before = before.rstrip()
895 if not before or before.endswith(('.', '!', '?')):
896 msg = msg[0].title() + msg[1:]
897
898 return (before + ' ' if before else '') + msg
899
900
901 class YoutubeDLError(Exception):
902 """Base exception for YoutubeDL errors."""
903 msg = None
904
905 def __init__(self, msg=None):
906 if msg is not None:
907 self.msg = msg
908 elif self.msg is None:
909 self.msg = type(self).__name__
910 super().__init__(self.msg)
911
912
913 class ExtractorError(YoutubeDLError):
914 """Error during info extraction."""
915
916 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
917 """ tb, if given, is the original traceback (so that it can be printed out).
918 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
919 """
920 from ..networking.exceptions import network_exceptions
921 if sys.exc_info()[0] in network_exceptions:
922 expected = True
923
924 self.orig_msg = str(msg)
925 self.traceback = tb
926 self.expected = expected
927 self.cause = cause
928 self.video_id = video_id
929 self.ie = ie
930 self.exc_info = sys.exc_info() # preserve original exception
931 if isinstance(self.exc_info[1], ExtractorError):
932 self.exc_info = self.exc_info[1].exc_info
933 super().__init__(self.__msg)
934
935 @property
936 def __msg(self):
937 return ''.join((
938 format_field(self.ie, None, '[%s] '),
939 format_field(self.video_id, None, '%s: '),
940 self.orig_msg,
941 format_field(self.cause, None, ' (caused by %r)'),
942 '' if self.expected else bug_reports_message()))
943
944 def format_traceback(self):
945 return join_nonempty(
946 self.traceback and ''.join(traceback.format_tb(self.traceback)),
947 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
948 delim='\n') or None
949
950 def __setattr__(self, name, value):
951 super().__setattr__(name, value)
952 if getattr(self, 'msg', None) and name not in ('msg', 'args'):
953 self.msg = self.__msg or type(self).__name__
954 self.args = (self.msg, ) # Cannot be property
955
956
957 class UnsupportedError(ExtractorError):
958 def __init__(self, url):
959 super().__init__(
960 'Unsupported URL: %s' % url, expected=True)
961 self.url = url
962
963
964 class RegexNotFoundError(ExtractorError):
965 """Error when a regex didn't match"""
966 pass
967
968
969 class GeoRestrictedError(ExtractorError):
970 """Geographic restriction Error exception.
971
972 This exception may be thrown when a video is not available from your
973 geographic location due to geographic restrictions imposed by a website.
974 """
975
976 def __init__(self, msg, countries=None, **kwargs):
977 kwargs['expected'] = True
978 super().__init__(msg, **kwargs)
979 self.countries = countries
980
981
982 class UserNotLive(ExtractorError):
983 """Error when a channel/user is not live"""
984
985 def __init__(self, msg=None, **kwargs):
986 kwargs['expected'] = True
987 super().__init__(msg or 'The channel is not currently live', **kwargs)
988
989
990 class DownloadError(YoutubeDLError):
991 """Download Error exception.
992
993 This exception may be thrown by FileDownloader objects if they are not
994 configured to continue on errors. They will contain the appropriate
995 error message.
996 """
997
998 def __init__(self, msg, exc_info=None):
999 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1000 super().__init__(msg)
1001 self.exc_info = exc_info
1002
1003
1004 class EntryNotInPlaylist(YoutubeDLError):
1005 """Entry not in playlist exception.
1006
1007 This exception will be thrown by YoutubeDL when a requested entry
1008 is not found in the playlist info_dict
1009 """
1010 msg = 'Entry not found in info'
1011
1012
1013 class SameFileError(YoutubeDLError):
1014 """Same File exception.
1015
1016 This exception will be thrown by FileDownloader objects if they detect
1017 multiple files would have to be downloaded to the same file on disk.
1018 """
1019 msg = 'Fixed output name but more than one file to download'
1020
1021 def __init__(self, filename=None):
1022 if filename is not None:
1023 self.msg += f': {filename}'
1024 super().__init__(self.msg)
1025
1026
1027 class PostProcessingError(YoutubeDLError):
1028 """Post Processing exception.
1029
1030 This exception may be raised by PostProcessor's .run() method to
1031 indicate an error in the postprocessing task.
1032 """
1033
1034
1035 class DownloadCancelled(YoutubeDLError):
1036 """ Exception raised when the download queue should be interrupted """
1037 msg = 'The download was cancelled'
1038
1039
1040 class ExistingVideoReached(DownloadCancelled):
1041 """ --break-on-existing triggered """
1042 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1043
1044
1045 class RejectedVideoReached(DownloadCancelled):
1046 """ --break-match-filter triggered """
1047 msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1048
1049
1050 class MaxDownloadsReached(DownloadCancelled):
1051 """ --max-downloads limit has been reached. """
1052 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1053
1054
1055 class ReExtractInfo(YoutubeDLError):
1056 """ Video info needs to be re-extracted. """
1057
1058 def __init__(self, msg, expected=False):
1059 super().__init__(msg)
1060 self.expected = expected
1061
1062
1063 class ThrottledDownload(ReExtractInfo):
1064 """ Download speed below --throttled-rate. """
1065 msg = 'The download speed is below throttle limit'
1066
1067 def __init__(self):
1068 super().__init__(self.msg, expected=False)
1069
1070
1071 class UnavailableVideoError(YoutubeDLError):
1072 """Unavailable Format exception.
1073
1074 This exception will be thrown when a video is requested
1075 in a format that is not available for that video.
1076 """
1077 msg = 'Unable to download video'
1078
1079 def __init__(self, err=None):
1080 if err is not None:
1081 self.msg += f': {err}'
1082 super().__init__(self.msg)
1083
1084
1085 class ContentTooShortError(YoutubeDLError):
1086 """Content Too Short exception.
1087
1088 This exception may be raised by FileDownloader objects when a file they
1089 download is too small for what the server announced first, indicating
1090 the connection was probably interrupted.
1091 """
1092
1093 def __init__(self, downloaded, expected):
1094 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1095 # Both in bytes
1096 self.downloaded = downloaded
1097 self.expected = expected
1098
1099
1100 class XAttrMetadataError(YoutubeDLError):
1101 def __init__(self, code=None, msg='Unknown error'):
1102 super().__init__(msg)
1103 self.code = code
1104 self.msg = msg
1105
1106 # Parsing code and msg
1107 if (self.code in (errno.ENOSPC, errno.EDQUOT)
1108 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1109 self.reason = 'NO_SPACE'
1110 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1111 self.reason = 'VALUE_TOO_LONG'
1112 else:
1113 self.reason = 'NOT_SUPPORTED'
1114
1115
1116 class XAttrUnavailableError(YoutubeDLError):
1117 pass
1118
1119
1120 def is_path_like(f):
1121 return isinstance(f, (str, bytes, os.PathLike))
1122
1123
1124 def extract_timezone(date_str):
1125 m = re.search(
1126 r'''(?x)
1127 ^.{8,}? # >=8 char non-TZ prefix, if present
1128 (?P<tz>Z| # just the UTC Z, or
1129 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1130 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1131 [ ]? # optional space
1132 (?P<sign>\+|-) # +/-
1133 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1134 $)
1135 ''', date_str)
1136 if not m:
1137 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1138 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1139 if timezone is not None:
1140 date_str = date_str[:-len(m.group('tz'))]
1141 timezone = datetime.timedelta(hours=timezone or 0)
1142 else:
1143 date_str = date_str[:-len(m.group('tz'))]
1144 if not m.group('sign'):
1145 timezone = datetime.timedelta()
1146 else:
1147 sign = 1 if m.group('sign') == '+' else -1
1148 timezone = datetime.timedelta(
1149 hours=sign * int(m.group('hours')),
1150 minutes=sign * int(m.group('minutes')))
1151 return timezone, date_str
1152
1153
1154 def parse_iso8601(date_str, delimiter='T', timezone=None):
1155 """ Return a UNIX timestamp from the given date """
1156
1157 if date_str is None:
1158 return None
1159
1160 date_str = re.sub(r'\.[0-9]+', '', date_str)
1161
1162 if timezone is None:
1163 timezone, date_str = extract_timezone(date_str)
1164
1165 with contextlib.suppress(ValueError):
1166 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1167 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1168 return calendar.timegm(dt.timetuple())
1169
1170
1171 def date_formats(day_first=True):
1172 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1173
1174
1175 def unified_strdate(date_str, day_first=True):
1176 """Return a string with the date in the format YYYYMMDD"""
1177
1178 if date_str is None:
1179 return None
1180 upload_date = None
1181 # Replace commas
1182 date_str = date_str.replace(',', ' ')
1183 # Remove AM/PM + timezone
1184 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1185 _, date_str = extract_timezone(date_str)
1186
1187 for expression in date_formats(day_first):
1188 with contextlib.suppress(ValueError):
1189 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1190 if upload_date is None:
1191 timetuple = email.utils.parsedate_tz(date_str)
1192 if timetuple:
1193 with contextlib.suppress(ValueError):
1194 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1195 if upload_date is not None:
1196 return str(upload_date)
1197
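# Illustrative usage (a sketch, not part of the original module):
#   >>> unified_strdate('December 21, 2012')
#   '20121221'
#   >>> unified_strdate('8/7/2009')  # day-first by default
#   '20090708'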
1198
1199 def unified_timestamp(date_str, day_first=True):
1200 if not isinstance(date_str, str):
1201 return None
1202
1203 date_str = re.sub(r'\s+', ' ', re.sub(
1204 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1205
1206 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1207 timezone, date_str = extract_timezone(date_str)
1208
1209 # Remove AM/PM + timezone
1210 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1211
1212 # Remove unrecognized timezones from ISO 8601-like timestamps
1213 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1214 if m:
1215 date_str = date_str[:-len(m.group('tz'))]
1216
1217 # Python only supports microseconds, so remove nanoseconds
1218 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1219 if m:
1220 date_str = m.group(1)
1221
1222 for expression in date_formats(day_first):
1223 with contextlib.suppress(ValueError):
1224 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1225 return calendar.timegm(dt.timetuple())
1226
1227 timetuple = email.utils.parsedate_tz(date_str)
1228 if timetuple:
1229 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1230
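# Illustrative usage (a sketch, not part of the original module):
#   >>> unified_timestamp('2023-01-02T03:04:05Z')
#   1672628645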
1231
1232 def determine_ext(url, default_ext='unknown_video'):
1233 if url is None or '.' not in url:
1234 return default_ext
1235 guess = url.partition('?')[0].rpartition('.')[2]
1236 if re.match(r'^[A-Za-z0-9]+$', guess):
1237 return guess
1238 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1239 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1240 return guess.rstrip('/')
1241 else:
1242 return default_ext
1243
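# Illustrative usage (a sketch, not part of the original module):
#   >>> determine_ext('http://example.com/foo/bar.mp4?download=1')
#   'mp4'
#   >>> determine_ext('http://example.com/clip')
#   'unknown_video'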
1244
1245 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1246 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1247
1248
1249 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1250 R"""
1251 Return a datetime object from a string.
1252 Supported format:
1253 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1254
1255 @param format strftime format of DATE
1256 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1257 auto: round to the unit provided in date_str (if applicable).
1258 """
1259 auto_precision = False
1260 if precision == 'auto':
1261 auto_precision = True
1262 precision = 'microsecond'
1263 today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
1264 if date_str in ('now', 'today'):
1265 return today
1266 if date_str == 'yesterday':
1267 return today - datetime.timedelta(days=1)
1268 match = re.match(
1269 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1270 date_str)
1271 if match is not None:
1272 start_time = datetime_from_str(match.group('start'), precision, format)
1273 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1274 unit = match.group('unit')
1275 if unit == 'month' or unit == 'year':
1276 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1277 unit = 'day'
1278 else:
1279 if unit == 'week':
1280 unit = 'day'
1281 time *= 7
1282 delta = datetime.timedelta(**{unit + 's': time})
1283 new_date = start_time + delta
1284 if auto_precision:
1285 return datetime_round(new_date, unit)
1286 return new_date
1287
1288 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1289
1290
1291 def date_from_str(date_str, format='%Y%m%d', strict=False):
1292 R"""
1293 Return a date object from a string using datetime_from_str
1294
1295 @param strict Restrict allowed patterns to "YYYYMMDD" and
1296 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1297 """
1298 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1299 raise ValueError(f'Invalid date format "{date_str}"')
1300 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1301
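# Illustrative usage (a sketch, not part of the original module; the first
# result depends on the current date):
#   date_from_str('now-1week')  # -> datetime.date for 7 days ago
#   date_from_str('20240101')   # -> datetime.date(2024, 1, 1)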
1302
1303 def datetime_add_months(dt, months):
1304 """Increment/Decrement a datetime object by months."""
1305 month = dt.month + months - 1
1306 year = dt.year + month // 12
1307 month = month % 12 + 1
1308 day = min(dt.day, calendar.monthrange(year, month)[1])
1309 return dt.replace(year, month, day)
1310
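# Illustrative usage (a sketch, not part of the original module; note the day
# is clamped to the target month's length):
#   >>> datetime_add_months(datetime.datetime(2020, 1, 31), 1)
#   datetime.datetime(2020, 2, 29, 0, 0)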
1311
1312 def datetime_round(dt, precision='day'):
1313 """
1314 Round a datetime object's time to a specific precision
1315 """
1316 if precision == 'microsecond':
1317 return dt
1318
1319 unit_seconds = {
1320 'day': 86400,
1321 'hour': 3600,
1322 'minute': 60,
1323 'second': 1,
1324 }
1325 roundto = lambda x, n: ((x + n / 2) // n) * n
1326 timestamp = roundto(calendar.timegm(dt.timetuple()), unit_seconds[precision])
1327 return datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
1328
1329
1330 def hyphenate_date(date_str):
1331 """
1332 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1333 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1334 if match is not None:
1335 return '-'.join(match.groups())
1336 else:
1337 return date_str
1338
1339
1340 class DateRange:
1341 """Represents a time interval between two dates"""
1342
1343 def __init__(self, start=None, end=None):
1344 """start and end must be strings in the format accepted by date"""
1345 if start is not None:
1346 self.start = date_from_str(start, strict=True)
1347 else:
1348 self.start = datetime.datetime.min.date()
1349 if end is not None:
1350 self.end = date_from_str(end, strict=True)
1351 else:
1352 self.end = datetime.datetime.max.date()
1353 if self.start > self.end:
1354 raise ValueError('Date range "%s": the start date must be before the end date' % self)
1355
1356 @classmethod
1357 def day(cls, day):
1358 """Returns a range that only contains the given day"""
1359 return cls(day, day)
1360
1361 def __contains__(self, date):
1362 """Check if the date is in the range"""
1363 if not isinstance(date, datetime.date):
1364 date = date_from_str(date)
1365 return self.start <= date <= self.end
1366
1367 def __repr__(self):
1368 return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
1369
1370 def __eq__(self, other):
1371 return (isinstance(other, DateRange)
1372 and self.start == other.start and self.end == other.end)
1373
1374
1375 @functools.cache
1376 def system_identifier():
1377 python_implementation = platform.python_implementation()
1378 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1379 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1380 libc_ver = []
1381 with contextlib.suppress(OSError): # We may not have access to the executable
1382 libc_ver = platform.libc_ver()
1383
1384 return 'Python %s (%s %s %s) - %s (%s%s)' % (
1385 platform.python_version(),
1386 python_implementation,
1387 platform.machine(),
1388 platform.architecture()[0],
1389 platform.platform(),
1390 ssl.OPENSSL_VERSION,
1391 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
1392 )
1393
1394
1395 @functools.cache
1396 def get_windows_version():
1397 """ Get Windows version. Returns () if not running on Windows """
1398 if compat_os_name == 'nt':
1399 return version_tuple(platform.win32_ver()[1])
1400 else:
1401 return ()
1402
1403
1404 def write_string(s, out=None, encoding=None):
1405 assert isinstance(s, str)
1406 out = out or sys.stderr
1407 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
1408 if not out:
1409 return
1410
1411 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1412 s = re.sub(r'([\r\n]+)', r' \1', s)
1413
1414 enc, buffer = None, out
1415 if 'b' in getattr(out, 'mode', ''):
1416 enc = encoding or preferredencoding()
1417 elif hasattr(out, 'buffer'):
1418 buffer = out.buffer
1419 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1420
1421 buffer.write(s.encode(enc, 'ignore') if enc else s)
1422 out.flush()
1423
1424
1425 # TODO: Use global logger
1426 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
1427 from .. import _IN_CLI
1428 if _IN_CLI:
1429 if msg in deprecation_warning._cache:
1430 return
1431 deprecation_warning._cache.add(msg)
1432 if printer:
1433 return printer(f'{msg}{bug_reports_message()}', **kwargs)
1434 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
1435 else:
1436 import warnings
1437 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
1438
1439
1440 deprecation_warning._cache = set()
1441
1442
1443 def bytes_to_intlist(bs):
1444 if not bs:
1445 return []
1446 if isinstance(bs[0], int): # Python 3
1447 return list(bs)
1448 else:
1449 return [ord(c) for c in bs]
1450
1451
1452 def intlist_to_bytes(xs):
1453 if not xs:
1454 return b''
1455 return struct.pack('%dB' % len(xs), *xs)
1456
1457
1458 class LockingUnsupportedError(OSError):
1459 msg = 'File locking is not supported'
1460
1461 def __init__(self):
1462 super().__init__(self.msg)
1463
1464
1465 # Cross-platform file locking
1466 if sys.platform == 'win32':
1467 import ctypes
1468 import ctypes.wintypes
1469 import msvcrt
1470
1471 class OVERLAPPED(ctypes.Structure):
1472 _fields_ = [
1473 ('Internal', ctypes.wintypes.LPVOID),
1474 ('InternalHigh', ctypes.wintypes.LPVOID),
1475 ('Offset', ctypes.wintypes.DWORD),
1476 ('OffsetHigh', ctypes.wintypes.DWORD),
1477 ('hEvent', ctypes.wintypes.HANDLE),
1478 ]
1479
1480 kernel32 = ctypes.WinDLL('kernel32')
1481 LockFileEx = kernel32.LockFileEx
1482 LockFileEx.argtypes = [
1483 ctypes.wintypes.HANDLE, # hFile
1484 ctypes.wintypes.DWORD, # dwFlags
1485 ctypes.wintypes.DWORD, # dwReserved
1486 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1487 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1488 ctypes.POINTER(OVERLAPPED) # Overlapped
1489 ]
1490 LockFileEx.restype = ctypes.wintypes.BOOL
1491 UnlockFileEx = kernel32.UnlockFileEx
1492 UnlockFileEx.argtypes = [
1493 ctypes.wintypes.HANDLE, # hFile
1494 ctypes.wintypes.DWORD, # dwReserved
1495 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1496 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1497 ctypes.POINTER(OVERLAPPED) # Overlapped
1498 ]
1499 UnlockFileEx.restype = ctypes.wintypes.BOOL
1500 whole_low = 0xffffffff
1501 whole_high = 0x7fffffff
1502
1503 def _lock_file(f, exclusive, block):
1504 overlapped = OVERLAPPED()
1505 overlapped.Offset = 0
1506 overlapped.OffsetHigh = 0
1507 overlapped.hEvent = 0
1508 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1509
1510 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1511 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1512 0, whole_low, whole_high, f._lock_file_overlapped_p):
1513 # NB: The no-argument form of ctypes.FormatError does not work on PyPy
1514 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
1515
1516 def _unlock_file(f):
1517 assert f._lock_file_overlapped_p
1518 handle = msvcrt.get_osfhandle(f.fileno())
1519 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
1520 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1521
1522 else:
1523 try:
1524 import fcntl
1525
1526 def _lock_file(f, exclusive, block):
1527 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1528 if not block:
1529 flags |= fcntl.LOCK_NB
1530 try:
1531 fcntl.flock(f, flags)
1532 except BlockingIOError:
1533 raise
1534 except OSError: # AOSP does not have flock()
1535 fcntl.lockf(f, flags)
1536
1537 def _unlock_file(f):
1538 with contextlib.suppress(OSError):
1539 return fcntl.flock(f, fcntl.LOCK_UN)
1540 with contextlib.suppress(OSError):
1541 return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
1542 return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
1543
1544 except ImportError:
1545
1546 def _lock_file(f, exclusive, block):
1547 raise LockingUnsupportedError()
1548
1549 def _unlock_file(f):
1550 raise LockingUnsupportedError()
1551
1552
1553 class locked_file:
1554 locked = False
1555
1556 def __init__(self, filename, mode, block=True, encoding=None):
1557 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
1558 raise NotImplementedError(mode)
1559 self.mode, self.block = mode, block
1560
1561 writable = any(f in mode for f in 'wax+')
1562 readable = any(f in mode for f in 'r+')
1563 flags = functools.reduce(operator.ior, (
1564 getattr(os, 'O_CLOEXEC', 0), # UNIX only
1565 getattr(os, 'O_BINARY', 0), # Windows only
1566 getattr(os, 'O_NOINHERIT', 0), # Windows only
1567 os.O_CREAT if writable else 0, # O_TRUNC only after locking
1568 os.O_APPEND if 'a' in mode else 0,
1569 os.O_EXCL if 'x' in mode else 0,
1570 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
1571 ))
1572
1573 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
1574
1575 def __enter__(self):
1576 exclusive = 'r' not in self.mode
1577 try:
1578 _lock_file(self.f, exclusive, self.block)
1579 self.locked = True
1580 except OSError:
1581 self.f.close()
1582 raise
1583 if 'w' in self.mode:
1584 try:
1585 self.f.truncate()
1586 except OSError as e:
1587 if e.errno not in (
1588 errno.ESPIPE, # Illegal seek - expected for FIFO
1589 errno.EINVAL, # Invalid argument - expected for /dev/null
1590 ):
1591 raise
1592 return self
1593
1594 def unlock(self):
1595 if not self.locked:
1596 return
1597 try:
1598 _unlock_file(self.f)
1599 finally:
1600 self.locked = False
1601
1602 def __exit__(self, *_):
1603 try:
1604 self.unlock()
1605 finally:
1606 self.f.close()
1607
1608 open = __enter__
1609 close = __exit__
1610
1611 def __getattr__(self, attr):
1612 return getattr(self.f, attr)
1613
1614 def __iter__(self):
1615 return iter(self.f)
1616
1617
1618 @functools.cache
1619 def get_filesystem_encoding():
1620 encoding = sys.getfilesystemencoding()
1621 return encoding if encoding is not None else 'utf-8'
1622
1623
1624 def shell_quote(args):
1625 quoted_args = []
1626 encoding = get_filesystem_encoding()
1627 for a in args:
1628 if isinstance(a, bytes):
1629 # We may get a filename encoded with 'encodeFilename'
1630 a = a.decode(encoding)
1631 quoted_args.append(compat_shlex_quote(a))
1632 return ' '.join(quoted_args)
1633
1634
1635 def smuggle_url(url, data):
1636 """ Pass additional data in a URL for internal use. """
1637
1638 url, idata = unsmuggle_url(url, {})
1639 data.update(idata)
1640 sdata = urllib.parse.urlencode(
1641 {'__youtubedl_smuggle': json.dumps(data)})
1642 return url + '#' + sdata
1643
1644
1645 def unsmuggle_url(smug_url, default=None):
1646 if '#__youtubedl_smuggle' not in smug_url:
1647 return smug_url, default
1648 url, _, sdata = smug_url.rpartition('#')
1649 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
1650 data = json.loads(jsond)
1651 return url, data
1652
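# Illustrative round trip (a sketch, not part of the original module):
#   url = smuggle_url('https://example.com/v', {'referer': 'https://example.com/'})
#   assert unsmuggle_url(url) == ('https://example.com/v', {'referer': 'https://example.com/'})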
1653
1654 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
1655 """ Formats numbers with decimal sufixes like K, M, etc """
1656 num, factor = float_or_none(num), float(factor)
1657 if num is None or num < 0:
1658 return None
1659 POSSIBLE_SUFFIXES = 'kMGTPEZY'
1660 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
1661 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
1662 if factor == 1024:
1663 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
1664 converted = num / (factor ** exponent)
1665 return fmt % (converted, suffix)
1666
1667
1668 def format_bytes(bytes):
1669 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
1670
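# Illustrative usage (a sketch, not part of the original module):
#   >>> format_bytes(1536)
#   '1.50KiB'
#   >>> format_bytes(None)
#   'N/A'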
1671
1672 def lookup_unit_table(unit_table, s, strict=False):
1673 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
1674 units_re = '|'.join(re.escape(u) for u in unit_table)
1675 m = (re.fullmatch if strict else re.match)(
1676 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
1677 if not m:
1678 return None
1679
1680 num = float(m.group('num').replace(',', '.'))
1681 mult = unit_table[m.group('unit')]
1682 return round(num * mult)
1683
1684
1685 def parse_bytes(s):
1686 """Parse a string indicating a byte quantity into an integer"""
1687 return lookup_unit_table(
1688 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
1689 s.upper(), strict=True)
1690
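# Illustrative usage (a sketch, not part of the original module; units here
# are binary multiples, so 'M' means 1024**2):
#   >>> parse_bytes('1.5M')
#   1572864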
1691
1692 def parse_filesize(s):
1693 if s is None:
1694 return None
1695
1696 # The lower-case forms are of course incorrect and unofficial,
1697 # but we support those too
1698 _UNIT_TABLE = {
1699 'B': 1,
1700 'b': 1,
1701 'bytes': 1,
1702 'KiB': 1024,
1703 'KB': 1000,
1704 'kB': 1024,
1705 'Kb': 1000,
1706 'kb': 1000,
1707 'kilobytes': 1000,
1708 'kibibytes': 1024,
1709 'MiB': 1024 ** 2,
1710 'MB': 1000 ** 2,
1711 'mB': 1024 ** 2,
1712 'Mb': 1000 ** 2,
1713 'mb': 1000 ** 2,
1714 'megabytes': 1000 ** 2,
1715 'mebibytes': 1024 ** 2,
1716 'GiB': 1024 ** 3,
1717 'GB': 1000 ** 3,
1718 'gB': 1024 ** 3,
1719 'Gb': 1000 ** 3,
1720 'gb': 1000 ** 3,
1721 'gigabytes': 1000 ** 3,
1722 'gibibytes': 1024 ** 3,
1723 'TiB': 1024 ** 4,
1724 'TB': 1000 ** 4,
1725 'tB': 1024 ** 4,
1726 'Tb': 1000 ** 4,
1727 'tb': 1000 ** 4,
1728 'terabytes': 1000 ** 4,
1729 'tebibytes': 1024 ** 4,
1730 'PiB': 1024 ** 5,
1731 'PB': 1000 ** 5,
1732 'pB': 1024 ** 5,
1733 'Pb': 1000 ** 5,
1734 'pb': 1000 ** 5,
1735 'petabytes': 1000 ** 5,
1736 'pebibytes': 1024 ** 5,
1737 'EiB': 1024 ** 6,
1738 'EB': 1000 ** 6,
1739 'eB': 1024 ** 6,
1740 'Eb': 1000 ** 6,
1741 'eb': 1000 ** 6,
1742 'exabytes': 1000 ** 6,
1743 'exbibytes': 1024 ** 6,
1744 'ZiB': 1024 ** 7,
1745 'ZB': 1000 ** 7,
1746 'zB': 1024 ** 7,
1747 'Zb': 1000 ** 7,
1748 'zb': 1000 ** 7,
1749 'zettabytes': 1000 ** 7,
1750 'zebibytes': 1024 ** 7,
1751 'YiB': 1024 ** 8,
1752 'YB': 1000 ** 8,
1753 'yB': 1024 ** 8,
1754 'Yb': 1000 ** 8,
1755 'yb': 1000 ** 8,
1756 'yottabytes': 1000 ** 8,
1757 'yobibytes': 1024 ** 8,
1758 }
1759
1760 return lookup_unit_table(_UNIT_TABLE, s)
1761
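# Illustrative usage (a sketch, not part of the original module):
#   >>> parse_filesize('1.5 GiB')
#   1610612736
#   >>> parse_filesize('1,5 MB')  # decimal comma is accepted
#   1500000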
1762
1763 def parse_count(s):
1764 if s is None:
1765 return None
1766
1767 s = re.sub(r'^[^\d]+\s', '', s).strip()
1768
1769 if re.match(r'^[\d,.]+$', s):
1770 return str_to_int(s)
1771
1772 _UNIT_TABLE = {
1773 'k': 1000,
1774 'K': 1000,
1775 'm': 1000 ** 2,
1776 'M': 1000 ** 2,
1777 'kk': 1000 ** 2,
1778 'KK': 1000 ** 2,
1779 'b': 1000 ** 3,
1780 'B': 1000 ** 3,
1781 }
1782
1783 ret = lookup_unit_table(_UNIT_TABLE, s)
1784 if ret is not None:
1785 return ret
1786
1787 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
1788 if mobj:
1789 return str_to_int(mobj.group(1))
1790
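# Illustrative usage (a sketch, not part of the original module):
#   >>> parse_count('1.2M')
#   1200000
#   >>> parse_count('15,743 views')
#   15743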
1791
1792 def parse_resolution(s, *, lenient=False):
1793 if s is None:
1794 return {}
1795
1796 if lenient:
1797 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
1798 else:
1799 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
1800 if mobj:
1801 return {
1802 'width': int(mobj.group('w')),
1803 'height': int(mobj.group('h')),
1804 }
1805
1806 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
1807 if mobj:
1808 return {'height': int(mobj.group(1))}
1809
1810 mobj = re.search(r'\b([48])[kK]\b', s)
1811 if mobj:
1812 return {'height': int(mobj.group(1)) * 540}
1813
1814 return {}
1815
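# An illustrative sketch of the three patterns tried above:
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p'), parse_resolution('4k')
#   ({'height': 720}, {'height': 2160})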
1816
1817 def parse_bitrate(s):
1818 if not isinstance(s, str):
1819 return
1820 mobj = re.search(r'\b(\d+)\s*kbps', s)
1821 if mobj:
1822 return int(mobj.group(1))
1823
1824
1825 def month_by_name(name, lang='en'):
1826 """ Return the number of a month by (locale-independently) English name """
1827
1828 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
1829
1830 try:
1831 return month_names.index(name) + 1
1832 except ValueError:
1833 return None
1834
1835
1836 def month_by_abbreviation(abbrev):
1837 """ Return the number of a month by (locale-independently) English
1838 abbreviations """
1839
1840 try:
1841 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1842 except ValueError:
1843 return None
1844
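# A quick sketch of the two month helpers above:
#   >>> month_by_name('February'), month_by_abbreviation('Feb')
#   (2, 2)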
1845
1846 def fix_xml_ampersands(xml_str):
1847 """Replace all the '&' by '&amp;' in XML"""
1848 return re.sub(
1849 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1850 '&amp;',
1851 xml_str)
1852
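# A quick sketch - already-escaped entities are left alone:
#   >>> fix_xml_ampersands('a &amp; b & c')
#   'a &amp; b &amp; c'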
1853
1854 def setproctitle(title):
1855 assert isinstance(title, str)
1856
1857 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
1858 try:
1859 import ctypes
1860 except ImportError:
1861 return
1862
1863 try:
1864 libc = ctypes.cdll.LoadLibrary('libc.so.6')
1865 except OSError:
1866 return
1867 except TypeError:
1868 # LoadLibrary in Windows Python 2.7.13 only expects
1869 # a bytestring, but since unicode_literals turns
1870 # every string into a unicode string, it fails.
1871 return
1872 title_bytes = title.encode()
1873 buf = ctypes.create_string_buffer(len(title_bytes))
1874 buf.value = title_bytes
1875 try:
1876 libc.prctl(15, buf, 0, 0, 0)
1877 except AttributeError:
1878 return # Strange libc, just skip this
1879
1880
1881 def remove_start(s, start):
1882 return s[len(start):] if s is not None and s.startswith(start) else s
1883
1884
1885 def remove_end(s, end):
1886 return s[:-len(end)] if s is not None and s.endswith(end) else s
1887
1888
1889 def remove_quotes(s):
1890 if s is None or len(s) < 2:
1891 return s
1892 for quote in ('"', "'", ):
1893 if s[0] == quote and s[-1] == quote:
1894 return s[1:-1]
1895 return s
1896
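# An illustrative sketch of the string helpers above:
#   >>> remove_start('www.example.com', 'www.')
#   'example.com'
#   >>> remove_quotes('"quoted"')
#   'quoted'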
1897
1898 def get_domain(url):
1899 """
1900 This implementation is inconsistent, but is kept for compatibility.
1901 Use this only for "webpage_url_domain"
1902 """
1903 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
1904
1905
1906 def url_basename(url):
1907 path = urllib.parse.urlparse(url).path
1908 return path.strip('/').split('/')[-1]
1909
1910
1911 def base_url(url):
1912 return re.match(r'https?://[^?#]+/', url).group()
1913
1914
1915 def urljoin(base, path):
1916 if isinstance(path, bytes):
1917 path = path.decode()
1918 if not isinstance(path, str) or not path:
1919 return None
1920 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
1921 return path
1922 if isinstance(base, bytes):
1923 base = base.decode()
1924 if not isinstance(base, str) or not re.match(
1925 r'^(?:https?:)?//', base):
1926 return None
1927 return urllib.parse.urljoin(base, path)
1928
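# An illustrative sketch (hypothetical URLs); scheme- and protocol-relative
# paths are returned unchanged:
#   >>> urljoin('https://example.com/a/', 'b.mp4')
#   'https://example.com/a/b.mp4'
#   >>> urljoin('https://example.com', '//cdn.example.com/x')
#   '//cdn.example.com/x'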
1929
1930 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1931 if get_attr and v is not None:
1932 v = getattr(v, get_attr, None)
1933 try:
1934 return int(v) * invscale // scale
1935 except (ValueError, TypeError, OverflowError):
1936 return default
1937
1938
1939 def str_or_none(v, default=None):
1940 return default if v is None else str(v)
1941
1942
1943 def str_to_int(int_str):
1944 """ A more relaxed version of int_or_none """
1945 if isinstance(int_str, int):
1946 return int_str
1947 elif isinstance(int_str, str):
1948 int_str = re.sub(r'[,\.\+]', '', int_str)
1949 return int_or_none(int_str)
1950
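# A quick sketch of the integer-coercion helpers above:
#   >>> str_to_int('123,456')
#   123456
#   >>> int_or_none('42', scale=10)
#   4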
1951
1952 def float_or_none(v, scale=1, invscale=1, default=None):
1953 if v is None:
1954 return default
1955 try:
1956 return float(v) * invscale / scale
1957 except (ValueError, TypeError):
1958 return default
1959
1960
1961 def bool_or_none(v, default=None):
1962 return v if isinstance(v, bool) else default
1963
1964
1965 def strip_or_none(v, default=None):
1966 return v.strip() if isinstance(v, str) else default
1967
1968
1969 def url_or_none(url):
1970 if not url or not isinstance(url, str):
1971 return None
1972 url = url.strip()
1973 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
1974
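# A quick sketch - only recognized schemes (or protocol-relative URLs) pass:
#   >>> url_or_none(' https://example.com ')
#   'https://example.com'
#   >>> url_or_none('example.com') is None
#   True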
1975
1976 def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
1977 datetime_object = None
1978 try:
1979 if isinstance(timestamp, (int, float)): # unix timestamp
1980 # Using naive datetime here can break timestamp() in Windows
1981 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
1982 # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
1983 # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
1984 datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
1985 + datetime.timedelta(seconds=timestamp))
1986 elif isinstance(timestamp, str): # assume YYYYMMDD
1987 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
1988 date_format = re.sub( # Support %s on windows
1989 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
1990 return datetime_object.strftime(date_format)
1991 except (ValueError, TypeError, AttributeError):
1992 return default
1993
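# An illustrative sketch (the Unix epoch, rendered in UTC):
#   >>> strftime_or_none(0, '%Y-%m-%d')
#   '1970-01-01'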
1994
1995 def parse_duration(s):
1996 if not isinstance(s, str):
1997 return None
1998 s = s.strip()
1999 if not s:
2000 return None
2001
2002 days, hours, mins, secs, ms = [None] * 5
2003 m = re.match(r'''(?x)
2004 (?P<before_secs>
2005 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2006 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2007 (?P<ms>[.:][0-9]+)?Z?$
2008 ''', s)
2009 if m:
2010 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2011 else:
2012 m = re.match(
2013 r'''(?ix)(?:P?
2014 (?:
2015 [0-9]+\s*y(?:ears?)?,?\s*
2016 )?
2017 (?:
2018 [0-9]+\s*m(?:onths?)?,?\s*
2019 )?
2020 (?:
2021 [0-9]+\s*w(?:eeks?)?,?\s*
2022 )?
2023 (?:
2024 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2025 )?
2026 T)?
2027 (?:
2028 (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
2029 )?
2030 (?:
2031 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2032 )?
2033 (?:
2034 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2035 )?Z?$''', s)
2036 if m:
2037 days, hours, mins, secs, ms = m.groups()
2038 else:
2039 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2040 if m:
2041 hours, mins = m.groups()
2042 else:
2043 return None
2044
2045 if ms:
2046 ms = ms.replace(':', '.')
2047 return sum(float(part or 0) * mult for part, mult in (
2048 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2049
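# A hand-checked sketch covering both duration grammars matched above:
#   >>> parse_duration('1:02:03.5')
#   3723.5
#   >>> parse_duration('2h 30m')
#   9000.0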
2050
2051 def prepend_extension(filename, ext, expected_real_ext=None):
2052 name, real_ext = os.path.splitext(filename)
2053 return (
2054 f'{name}.{ext}{real_ext}'
2055 if not expected_real_ext or real_ext[1:] == expected_real_ext
2056 else f'{filename}.{ext}')
2057
2058
2059 def replace_extension(filename, ext, expected_real_ext=None):
2060 name, real_ext = os.path.splitext(filename)
2061 return '{}.{}'.format(
2062 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2063 ext)
2064
2065
2066 def check_executable(exe, args=[]):
2067 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2068 args can be a list of arguments for a short output (like -version) """
2069 try:
2070 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2071 except OSError:
2072 return False
2073 return exe
2074
2075
2076 def _get_exe_version_output(exe, args):
2077 try:
2078 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2079 # SIGTTOU if yt-dlp is run in the background.
2080 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2081 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2082 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2083 if ret:
2084 return None
2085 except OSError:
2086 return False
2087 return stdout
2088
2089
2090 def detect_exe_version(output, version_re=None, unrecognized='present'):
2091 assert isinstance(output, str)
2092 if version_re is None:
2093 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2094 m = re.search(version_re, output)
2095 if m:
2096 return m.group(1)
2097 else:
2098 return unrecognized
2099
2100
2101 def get_exe_version(exe, args=['--version'],
2102 version_re=None, unrecognized=('present', 'broken')):
2103 """ Returns the version of the specified executable,
2104 or False if the executable is not present """
2105 unrecognized = variadic(unrecognized)
2106 assert len(unrecognized) in (1, 2)
2107 out = _get_exe_version_output(exe, args)
2108 if out is None:
2109 return unrecognized[-1]
2110 return out and detect_exe_version(out, version_re, unrecognized[0])
2111
2112
2113 def frange(start=0, stop=None, step=1):
2114 """Float range"""
2115 if stop is None:
2116 start, stop = 0, start
2117 sign = [-1, 1][step > 0] if step else 0
2118 while sign * start < sign * stop:
2119 yield start
2120 start += step
2121
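# A quick sketch - like range(), but accepting float steps:
#   >>> list(frange(0, 1, 0.25))
#   [0, 0.25, 0.5, 0.75]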
2122
2123 class LazyList(collections.abc.Sequence):
2124 """Lazy immutable list from an iterable
2125 Note that slices of a LazyList are lists and not LazyList"""
2126
2127 class IndexError(IndexError):
2128 pass
2129
2130 def __init__(self, iterable, *, reverse=False, _cache=None):
2131 self._iterable = iter(iterable)
2132 self._cache = [] if _cache is None else _cache
2133 self._reversed = reverse
2134
2135 def __iter__(self):
2136 if self._reversed:
2137 # We need to consume the entire iterable to iterate in reverse
2138 yield from self.exhaust()
2139 return
2140 yield from self._cache
2141 for item in self._iterable:
2142 self._cache.append(item)
2143 yield item
2144
2145 def _exhaust(self):
2146 self._cache.extend(self._iterable)
2147 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2148 return self._cache
2149
2150 def exhaust(self):
2151 """Evaluate the entire iterable"""
2152 return self._exhaust()[::-1 if self._reversed else 1]
2153
2154 @staticmethod
2155 def _reverse_index(x):
2156 return None if x is None else ~x
2157
2158 def __getitem__(self, idx):
2159 if isinstance(idx, slice):
2160 if self._reversed:
2161 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2162 start, stop, step = idx.start, idx.stop, idx.step or 1
2163 elif isinstance(idx, int):
2164 if self._reversed:
2165 idx = self._reverse_index(idx)
2166 start, stop, step = idx, idx, 0
2167 else:
2168 raise TypeError('indices must be integers or slices')
2169 if ((start or 0) < 0 or (stop or 0) < 0
2170 or (start is None and step < 0)
2171 or (stop is None and step > 0)):
2172 # We need to consume the entire iterable to be able to slice from the end
2173 # Obviously, never use this with infinite iterables
2174 self._exhaust()
2175 try:
2176 return self._cache[idx]
2177 except IndexError as e:
2178 raise self.IndexError(e) from e
2179 n = max(start or 0, stop or 0) - len(self._cache) + 1
2180 if n > 0:
2181 self._cache.extend(itertools.islice(self._iterable, n))
2182 try:
2183 return self._cache[idx]
2184 except IndexError as e:
2185 raise self.IndexError(e) from e
2186
2187 def __bool__(self):
2188 try:
2189 self[-1] if self._reversed else self[0]
2190 except self.IndexError:
2191 return False
2192 return True
2193
2194 def __len__(self):
2195 self._exhaust()
2196 return len(self._cache)
2197
2198 def __reversed__(self):
2199 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2200
2201 def __copy__(self):
2202 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2203
2204 def __repr__(self):
2205 # repr and str should mimic a list. So we exhaust the iterable
2206 return repr(self.exhaust())
2207
2208 def __str__(self):
2209 return repr(self.exhaust())
2210
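# A sketch of LazyList semantics - the iterable is only consumed as far as
# indexing requires, and slices come back as plain lists:
#   >>> ll = LazyList(itertools.count())
#   >>> ll[3]
#   3
#   >>> ll[:2]
#   [0, 1]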
2211
2212 class PagedList:
2213
2214 class IndexError(IndexError):
2215 pass
2216
2217 def __len__(self):
2218 # This is only useful for tests
2219 return len(self.getslice())
2220
2221 def __init__(self, pagefunc, pagesize, use_cache=True):
2222 self._pagefunc = pagefunc
2223 self._pagesize = pagesize
2224 self._pagecount = float('inf')
2225 self._use_cache = use_cache
2226 self._cache = {}
2227
2228 def getpage(self, pagenum):
2229 page_results = self._cache.get(pagenum)
2230 if page_results is None:
2231 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2232 if self._use_cache:
2233 self._cache[pagenum] = page_results
2234 return page_results
2235
2236 def getslice(self, start=0, end=None):
2237 return list(self._getslice(start, end))
2238
2239 def _getslice(self, start, end):
2240 raise NotImplementedError('This method must be implemented by subclasses')
2241
2242 def __getitem__(self, idx):
2243 assert self._use_cache, 'Indexing PagedList requires cache'
2244 if not isinstance(idx, int) or idx < 0:
2245 raise TypeError('indices must be non-negative integers')
2246 entries = self.getslice(idx, idx + 1)
2247 if not entries:
2248 raise self.IndexError()
2249 return entries[0]
2250
2251
2252 class OnDemandPagedList(PagedList):
2253 """Download pages until a page with less than maximum results"""
2254
2255 def _getslice(self, start, end):
2256 for pagenum in itertools.count(start // self._pagesize):
2257 firstid = pagenum * self._pagesize
2258 nextfirstid = pagenum * self._pagesize + self._pagesize
2259 if start >= nextfirstid:
2260 continue
2261
2262 startv = (
2263 start % self._pagesize
2264 if firstid <= start < nextfirstid
2265 else 0)
2266 endv = (
2267 ((end - 1) % self._pagesize) + 1
2268 if (end is not None and firstid <= end <= nextfirstid)
2269 else None)
2270
2271 try:
2272 page_results = self.getpage(pagenum)
2273 except Exception:
2274 self._pagecount = pagenum - 1
2275 raise
2276 if startv != 0 or endv is not None:
2277 page_results = page_results[startv:endv]
2278 yield from page_results
2279
2280 # A little optimization: if the current page is not "full", i.e. does
2281 # not contain page_size videos, then we can assume that this page
2282 # is the last one - there are no more ids on further pages,
2283 # so there is no need to query again.
2284 if len(page_results) + startv < self._pagesize:
2285 break
2286
2287 # If we got the whole page, but the next page is not interesting,
2288 # break out early as well
2289 if end == nextfirstid:
2290 break
2291
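# A sketch with a hypothetical page function - pages are fetched lazily and
# fetching stops at the first short ("non-full") page:
#   def fetch_page(n):
#       return [f'video-{n}-{i}' for i in range(10)] if n < 3 else []
#   pl = OnDemandPagedList(fetch_page, 10)
#   pl.getslice(5, 12)  # fetches only pages 0 and 1, yielding 7 entries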
2292
2293 class InAdvancePagedList(PagedList):
2294 """PagedList with total number of pages known in advance"""
2295
2296 def __init__(self, pagefunc, pagecount, pagesize):
2297 PagedList.__init__(self, pagefunc, pagesize, True)
2298 self._pagecount = pagecount
2299
2300 def _getslice(self, start, end):
2301 start_page = start // self._pagesize
2302 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2303 skip_elems = start - start_page * self._pagesize
2304 only_more = None if end is None else end - start
2305 for pagenum in range(start_page, end_page):
2306 page_results = self.getpage(pagenum)
2307 if skip_elems:
2308 page_results = page_results[skip_elems:]
2309 skip_elems = None
2310 if only_more is not None:
2311 if len(page_results) < only_more:
2312 only_more -= len(page_results)
2313 else:
2314 yield from page_results[:only_more]
2315 break
2316 yield from page_results
2317
2318
2319 class PlaylistEntries:
2320 MissingEntry = object()
2321 is_exhausted = False
2322
2323 def __init__(self, ydl, info_dict):
2324 self.ydl = ydl
2325
2326 # _entries must be assigned now since info_dict can change during iteration
2327 entries = info_dict.get('entries')
2328 if entries is None:
2329 raise EntryNotInPlaylist('There are no entries')
2330 elif isinstance(entries, list):
2331 self.is_exhausted = True
2332
2333 requested_entries = info_dict.get('requested_entries')
2334 self.is_incomplete = requested_entries is not None
2335 if self.is_incomplete:
2336 assert self.is_exhausted
2337 self._entries = [self.MissingEntry] * max(requested_entries or [0])
2338 for i, entry in zip(requested_entries, entries):
2339 self._entries[i - 1] = entry
2340 elif isinstance(entries, (list, PagedList, LazyList)):
2341 self._entries = entries
2342 else:
2343 self._entries = LazyList(entries)
2344
2345 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2346 (?P<start>[+-]?\d+)?
2347 (?P<range>[:-]
2348 (?P<end>[+-]?\d+|inf(?:inite)?)?
2349 (?::(?P<step>[+-]?\d+))?
2350 )?''')
2351
2352 @classmethod
2353 def parse_playlist_items(cls, string):
2354 for segment in string.split(','):
2355 if not segment:
2356 raise ValueError('There are two or more consecutive commas')
2357 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2358 if not mobj:
2359 raise ValueError(f'{segment!r} is not a valid specification')
2360 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2361 if int_or_none(step) == 0:
2362 raise ValueError(f'Step in {segment!r} cannot be zero')
2363 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2364
2365 def get_requested_items(self):
2366 playlist_items = self.ydl.params.get('playlist_items')
2367 playlist_start = self.ydl.params.get('playliststart', 1)
2368 playlist_end = self.ydl.params.get('playlistend')
2369 # For backwards compatibility, interpret -1 as whole list
2370 if playlist_end in (-1, None):
2371 playlist_end = ''
2372 if not playlist_items:
2373 playlist_items = f'{playlist_start}:{playlist_end}'
2374 elif playlist_start != 1 or playlist_end:
2375 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2376
2377 for index in self.parse_playlist_items(playlist_items):
2378 for i, entry in self[index]:
2379 yield i, entry
2380 if not entry:
2381 continue
2382 try:
2383 # The item may have just been added to archive. Don't break due to it
2384 if not self.ydl.params.get('lazy_playlist'):
2385 # TODO: Add auto-generated fields
2386 self.ydl._match_entry(entry, incomplete=True, silent=True)
2387 except (ExistingVideoReached, RejectedVideoReached):
2388 return
2389
2390 def get_full_count(self):
2391 if self.is_exhausted and not self.is_incomplete:
2392 return len(self)
2393 elif isinstance(self._entries, InAdvancePagedList):
2394 if self._entries._pagesize == 1:
2395 return self._entries._pagecount
2396
2397 @functools.cached_property
2398 def _getter(self):
2399 if isinstance(self._entries, list):
2400 def get_entry(i):
2401 try:
2402 entry = self._entries[i]
2403 except IndexError:
2404 entry = self.MissingEntry
2405 if not self.is_incomplete:
2406 raise self.IndexError()
2407 if entry is self.MissingEntry:
2408 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
2409 return entry
2410 else:
2411 def get_entry(i):
2412 try:
2413 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2414 except (LazyList.IndexError, PagedList.IndexError):
2415 raise self.IndexError()
2416 return get_entry
2417
2418 def __getitem__(self, idx):
2419 if isinstance(idx, int):
2420 idx = slice(idx, idx)
2421
2422 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2423 step = 1 if idx.step is None else idx.step
2424 if idx.start is None:
2425 start = 0 if step > 0 else len(self) - 1
2426 else:
2427 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2428
2429 # NB: Do not call len(self) when idx == [:]
2430 if idx.stop is None:
2431 stop = 0 if step < 0 else float('inf')
2432 else:
2433 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2434 stop += [-1, 1][step > 0]
2435
2436 for i in frange(start, stop, step):
2437 if i < 0:
2438 continue
2439 try:
2440 entry = self._getter(i)
2441 except self.IndexError:
2442 self.is_exhausted = True
2443 if step > 0:
2444 break
2445 continue
2446 yield i + 1, entry
2447
2448 def __len__(self):
2449 return len(tuple(self[:]))
2450
2451 class IndexError(IndexError):
2452 pass
2453
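# A sketch of the --playlist-items grammar parsed by the class above:
#   >>> list(PlaylistEntries.parse_playlist_items('1,3:5,-1'))
#   [1, slice(3, 5.0, None), -1]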
2454
2455 def uppercase_escape(s):
2456 unicode_escape = codecs.getdecoder('unicode_escape')
2457 return re.sub(
2458 r'\\U[0-9a-fA-F]{8}',
2459 lambda m: unicode_escape(m.group(0))[0],
2460 s)
2461
2462
2463 def lowercase_escape(s):
2464 unicode_escape = codecs.getdecoder('unicode_escape')
2465 return re.sub(
2466 r'\\u[0-9a-fA-F]{4}',
2467 lambda m: unicode_escape(m.group(0))[0],
2468 s)
2469
2470
2471 def parse_qs(url, **kwargs):
2472 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
2473
2474
2475 def read_batch_urls(batch_fd):
2476 def fixup(url):
2477 if not isinstance(url, str):
2478 url = url.decode('utf-8', 'replace')
2479 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2480 for bom in BOM_UTF8:
2481 if url.startswith(bom):
2482 url = url[len(bom):]
2483 url = url.lstrip()
2484 if not url or url.startswith(('#', ';', ']')):
2485 return False
2486 # "#" cannot be stripped out since it is part of the URI
2487 # However, it can be safely stripped out if it follows a whitespace
2488 return re.split(r'\s#', url, 1)[0].rstrip()
2489
2490 with contextlib.closing(batch_fd) as fd:
2491 return [url for url in map(fixup, fd) if url]
2492
2493
2494 def urlencode_postdata(*args, **kargs):
2495 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
2496
2497
2498 def update_url(url, *, query_update=None, **kwargs):
2499 """Replace URL components specified by kwargs
2500 @param url str or parsed URL tuple
2501 @param query_update dict of query parameters to merge into the existing query
2502 @returns str
2503 """
2504 if isinstance(url, str):
2505 if not kwargs and not query_update:
2506 return url
2507 else:
2508 url = urllib.parse.urlparse(url)
2509 if query_update:
2510 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
2511 kwargs['query'] = urllib.parse.urlencode({
2512 **urllib.parse.parse_qs(url.query),
2513 **query_update
2514 }, True)
2515 return urllib.parse.urlunparse(url._replace(**kwargs))
2516
2517
2518 def update_url_query(url, query):
2519 return update_url(url, query_update=query)
2520
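# A quick sketch (hypothetical URL) - existing query parameters are kept:
#   >>> update_url_query('https://example.com/path?a=1', {'b': 2})
#   'https://example.com/path?a=1&b=2'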
2521
2522 def _multipart_encode_impl(data, boundary):
2523 content_type = 'multipart/form-data; boundary=%s' % boundary
2524
2525 out = b''
2526 for k, v in data.items():
2527 out += b'--' + boundary.encode('ascii') + b'\r\n'
2528 if isinstance(k, str):
2529 k = k.encode()
2530 if isinstance(v, str):
2531 v = v.encode()
2532 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2533 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2534 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2535 if boundary.encode('ascii') in content:
2536 raise ValueError('Boundary overlaps with data')
2537 out += content
2538
2539 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2540
2541 return out, content_type
2542
2543
2544 def multipart_encode(data, boundary=None):
2545 '''
2546 Encode a dict to RFC 7578-compliant form-data
2547
2548 data:
2549 A dict where keys and values can be either Unicode or bytes-like
2550 objects.
2551 boundary:
2552 If specified, it must be a Unicode object and is used as the boundary.
2553 Otherwise a random boundary is generated.
2554
2555 Reference: https://tools.ietf.org/html/rfc7578
2556 '''
2557 has_specified_boundary = boundary is not None
2558
2559 while True:
2560 if boundary is None:
2561 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2562
2563 try:
2564 out, content_type = _multipart_encode_impl(data, boundary)
2565 break
2566 except ValueError:
2567 if has_specified_boundary:
2568 raise
2569 boundary = None
2570
2571 return out, content_type
2572
2573
2574 def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
2575 if blocked_types is NO_DEFAULT:
2576 blocked_types = (str, bytes, collections.abc.Mapping)
2577 return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
2578
2579
2580 def variadic(x, allowed_types=NO_DEFAULT):
2581 if not isinstance(allowed_types, (tuple, type)):
2582 deprecation_warning('allowed_types should be a tuple or a type')
2583 allowed_types = tuple(allowed_types)
2584 return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
2585
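# A quick sketch - strings are iterable but still get wrapped:
#   >>> variadic('abc'), variadic(['a', 'b'])
#   (('abc',), ['a', 'b'])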
2586
2587 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2588 for f in funcs:
2589 try:
2590 val = f(*args, **kwargs)
2591 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
2592 pass
2593 else:
2594 if expected_type is None or isinstance(val, expected_type):
2595 return val
2596
2597
2598 def try_get(src, getter, expected_type=None):
2599 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2600
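# An illustrative sketch - getters that raise simply fall through to None:
#   >>> try_get({'a': {'b': 1}}, lambda x: x['a']['b'], int)
#   1
#   >>> try_get({}, lambda x: x['missing']) is None
#   True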
2601
2602 def filter_dict(dct, cndn=lambda _, v: v is not None):
2603 return {k: v for k, v in dct.items() if cndn(k, v)}
2604
2605
2606 def merge_dicts(*dicts):
2607 merged = {}
2608 for a_dict in dicts:
2609 for k, v in a_dict.items():
2610 if (v is not None and k not in merged
2611 or isinstance(v, str) and merged[k] == ''):
2612 merged[k] = v
2613 return merged
2614
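# A hand-checked sketch - earlier dicts win, except that an empty string may
# be overridden by a later non-empty value:
#   >>> merge_dicts({'a': 1, 'b': ''}, {'a': 2, 'b': 'x'})
#   {'a': 1, 'b': 'x'}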
2615
2616 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2617 return string if isinstance(string, str) else str(string, encoding, errors)
2618
2619
2620 US_RATINGS = {
2621 'G': 0,
2622 'PG': 10,
2623 'PG-13': 13,
2624 'R': 16,
2625 'NC': 18,
2626 }
2627
2628
2629 TV_PARENTAL_GUIDELINES = {
2630 'TV-Y': 0,
2631 'TV-Y7': 7,
2632 'TV-G': 0,
2633 'TV-PG': 0,
2634 'TV-14': 14,
2635 'TV-MA': 17,
2636 }
2637
2638
2639 def parse_age_limit(s):
2640 # isinstance(False, int) is True. So type() must be used instead
2641 if type(s) is int: # noqa: E721
2642 return s if 0 <= s <= 21 else None
2643 elif not isinstance(s, str):
2644 return None
2645 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2646 if m:
2647 return int(m.group('age'))
2648 s = s.upper()
2649 if s in US_RATINGS:
2650 return US_RATINGS[s]
2651 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
2652 if m:
2653 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
2654 return None
2655
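# A quick sketch spanning the three notations handled above:
#   >>> parse_age_limit('PG-13'), parse_age_limit('TV-MA'), parse_age_limit('18+')
#   (13, 17, 18)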
2656
2657 def strip_jsonp(code):
2658 return re.sub(
2659 r'''(?sx)^
2660 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2661 (?:\s*&&\s*(?P=func_name))?
2662 \s*\(\s*(?P<callback_data>.*)\);?
2663 \s*?(?://[^\n]*)*$''',
2664 r'\g<callback_data>', code)
2665
2666
2667 def js_to_json(code, vars={}, *, strict=False):
2668 # vars is a dict of var, val pairs to substitute
2669 STRING_QUOTES = '\'"`'
2670 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
2671 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
2672 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
2673 INTEGER_TABLE = (
2674 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
2675 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
2676 )
2677
2678 def process_escape(match):
2679 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
2680 escape = match.group(1) or match.group(2)
2681
2682 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
2683 else R'\u00' if escape == 'x'
2684 else '' if escape == '\n'
2685 else escape)
2686
2687 def template_substitute(match):
2688 evaluated = js_to_json(match.group(1), vars, strict=strict)
2689 if evaluated[0] == '"':
2690 return json.loads(evaluated)
2691 return evaluated
2692
2693 def fix_kv(m):
2694 v = m.group(0)
2695 if v in ('true', 'false', 'null'):
2696 return v
2697 elif v in ('undefined', 'void 0'):
2698 return 'null'
2699 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
2700 return ''
2701
2702 if v[0] in STRING_QUOTES:
2703 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
2704 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
2705 return f'"{escaped}"'
2706
2707 for regex, base in INTEGER_TABLE:
2708 im = re.match(regex, v)
2709 if im:
2710 i = int(im.group(1), base)
2711 return f'"{i}":' if v.endswith(':') else str(i)
2712
2713 if v in vars:
2714 try:
2715 if not strict:
2716 json.loads(vars[v])
2717 except json.JSONDecodeError:
2718 return json.dumps(vars[v])
2719 else:
2720 return vars[v]
2721
2722 if not strict:
2723 return f'"{v}"'
2724
2725 raise ValueError(f'Unknown value: {v}')
2726
2727 def create_map(mobj):
2728 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
2729
2730 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
2731 if not strict:
2732 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
2733 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
2734 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
2735 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
2736
2737 return re.sub(rf'''(?sx)
2738 {STRING_RE}|
2739 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
2740 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
2741 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
2742 [0-9]+(?={SKIP_RE}:)|
2743 !+
2744 ''', fix_kv, code)
2745
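# A hand-checked sketch of js_to_json on a small JS object literal
# (unquoted keys, a hex int, `undefined` and a trailing comma):
#   >>> js_to_json("{key: 'value', n: 0x10, flag: undefined,}")
#   '{"key": "value", "n": 16, "flag": null}'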
2746
2747 def qualities(quality_ids):
2748 """ Get a numeric quality value out of a list of possible values """
2749 def q(qid):
2750 try:
2751 return quality_ids.index(qid)
2752 except ValueError:
2753 return -1
2754 return q
2755
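# A quick sketch - a higher index means higher quality; unknown ids give -1:
#   >>> q = qualities(['240p', '480p', '720p'])
#   >>> q('720p'), q('480p'), q('unknown')
#   (2, 1, -1)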
2756
2757 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
2758
2759
2760 DEFAULT_OUTTMPL = {
2761 'default': '%(title)s [%(id)s].%(ext)s',
2762 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
2763 }
2764 OUTTMPL_TYPES = {
2765 'chapter': None,
2766 'subtitle': None,
2767 'thumbnail': None,
2768 'description': 'description',
2769 'annotation': 'annotations.xml',
2770 'infojson': 'info.json',
2771 'link': None,
2772 'pl_video': None,
2773 'pl_thumbnail': None,
2774 'pl_description': 'description',
2775 'pl_infojson': 'info.json',
2776 }
2777
2778 # As of [1] format syntax is:
2779 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
2780 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
2781 STR_FORMAT_RE_TMPL = r'''(?x)
2782 (?<!%)(?P<prefix>(?:%%)*)
2783 %
2784 (?P<has_key>\((?P<key>{0})\))?
2785 (?P<format>
2786 (?P<conversion>[#0\-+ ]+)?
2787 (?P<min_width>\d+)?
2788 (?P<precision>\.\d+)?
2789 (?P<len_mod>[hlL])? # unused in python
2790 {1} # conversion type
2791 )
2792 '''
2793
2794
2795 STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2796
2797
2798 def limit_length(s, length):
2799 """ Add ellipses to overly long strings """
2800 if s is None:
2801 return None
2802 ELLIPSES = '...'
2803 if len(s) > length:
2804 return s[:length - len(ELLIPSES)] + ELLIPSES
2805 return s
2806
2807
2808 def version_tuple(v):
2809 return tuple(int(e) for e in re.split(r'[-.]', v))
2810
2811
2812 def is_outdated_version(version, limit, assume_new=True):
2813 if not version:
2814 return not assume_new
2815 try:
2816 return version_tuple(version) < version_tuple(limit)
2817 except ValueError:
2818 return not assume_new
2819
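# A quick sketch of the two version helpers above:
#   >>> version_tuple('2023.09.24')
#   (2023, 9, 24)
#   >>> is_outdated_version('1.0', '1.1')
#   True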
2820
2821 def ytdl_is_updateable():
2822 """ Returns if yt-dlp can be updated with -U """
2823
2824 from ..update import is_non_updateable
2825
2826 return not is_non_updateable()
2827
2828
2829 def args_to_str(args):
2830 # Get a short string representation for a subprocess command
2831 return ' '.join(compat_shlex_quote(a) for a in args)
2832
2833
2834 def error_to_str(err):
2835 return f'{type(err).__name__}: {err}'
2836
2837
2838 def mimetype2ext(mt, default=NO_DEFAULT):
2839 if not isinstance(mt, str):
2840 if default is not NO_DEFAULT:
2841 return default
2842 return None
2843
2844 MAP = {
2845 # video
2846 '3gpp': '3gp',
2847 'mp2t': 'ts',
2848 'mp4': 'mp4',
2849 'mpeg': 'mpeg',
2850 'mpegurl': 'm3u8',
2851 'quicktime': 'mov',
2852 'webm': 'webm',
2853 'vp9': 'vp9',
2854 'video/ogg': 'ogv',
2855 'x-flv': 'flv',
2856 'x-m4v': 'm4v',
2857 'x-matroska': 'mkv',
2858 'x-mng': 'mng',
2859 'x-mp4-fragmented': 'mp4',
2860 'x-ms-asf': 'asf',
2861 'x-ms-wmv': 'wmv',
2862 'x-msvideo': 'avi',
2863
2864 # application (streaming playlists)
2865 'dash+xml': 'mpd',
2866 'f4m+xml': 'f4m',
2867 'hds+xml': 'f4m',
2868 'vnd.apple.mpegurl': 'm3u8',
2869 'vnd.ms-sstr+xml': 'ism',
2870 'x-mpegurl': 'm3u8',
2871
2872 # audio
2873 'audio/mp4': 'm4a',
2874 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
2875 # Using .mp3 as it's the most popular one
2876 'audio/mpeg': 'mp3',
2877 'audio/webm': 'webm',
2878 'audio/x-matroska': 'mka',
2879 'audio/x-mpegurl': 'm3u',
2880 'midi': 'mid',
2881 'ogg': 'ogg',
2882 'wav': 'wav',
2883 'wave': 'wav',
2884 'x-aac': 'aac',
2885 'x-flac': 'flac',
2886 'x-m4a': 'm4a',
2887 'x-realaudio': 'ra',
2888 'x-wav': 'wav',
2889
2890 # image
2891 'avif': 'avif',
2892 'bmp': 'bmp',
2893 'gif': 'gif',
2894 'jpeg': 'jpg',
2895 'png': 'png',
2896 'svg+xml': 'svg',
2897 'tiff': 'tif',
2898 'vnd.wap.wbmp': 'wbmp',
2899 'webp': 'webp',
2900 'x-icon': 'ico',
2901 'x-jng': 'jng',
2902 'x-ms-bmp': 'bmp',
2903
2904 # caption
2905 'filmstrip+json': 'fs',
2906 'smptett+xml': 'tt',
2907 'ttaf+xml': 'dfxp',
2908 'ttml+xml': 'ttml',
2909 'x-ms-sami': 'sami',
2910
2911 # misc
2912 'gzip': 'gz',
2913 'json': 'json',
2914 'xml': 'xml',
2915 'zip': 'zip',
2916 }
2917
2918 mimetype = mt.partition(';')[0].strip().lower()
2919 _, _, subtype = mimetype.rpartition('/')
2920
2921 ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
2922 if ext:
2923 return ext
2924 elif default is not NO_DEFAULT:
2925 return default
2926 return subtype.replace('+', '.')
2927
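# An illustrative sketch - MIME parameters are ignored and unmapped subtypes
# fall back sensibly:
#   >>> mimetype2ext('video/mp4; codecs="avc1.42E01E"')
#   'mp4'
#   >>> mimetype2ext('application/vnd.apple.mpegurl')
#   'm3u8'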
2928
2929 def ext2mimetype(ext_or_url):
2930 if not ext_or_url:
2931 return None
2932 if '.' not in ext_or_url:
2933 ext_or_url = f'file.{ext_or_url}'
2934 return mimetypes.guess_type(ext_or_url)[0]
2935
2936
2937 def parse_codecs(codecs_str):
2938 # http://tools.ietf.org/html/rfc6381
2939 if not codecs_str:
2940 return {}
2941 split_codecs = list(filter(None, map(
2942 str.strip, codecs_str.strip().strip(',').split(','))))
2943 vcodec, acodec, scodec, hdr = None, None, None, None
2944 for full_codec in split_codecs:
2945 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
2946 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
2947 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
2948 if vcodec:
2949 continue
2950 vcodec = full_codec
2951 if parts[0] in ('dvh1', 'dvhe'):
2952 hdr = 'DV'
2953 elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
2954 hdr = 'HDR10'
2955 elif parts[:2] == ['vp9', '2']:
2956 hdr = 'HDR10'
2957 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
2958 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
2959 acodec = acodec or full_codec
2960 elif parts[0] in ('stpp', 'wvtt'):
2961 scodec = scodec or full_codec
2962 else:
2963 write_string(f'WARNING: Unknown codec {full_codec}\n')
2964 if vcodec or acodec or scodec:
2965 return {
2966 'vcodec': vcodec or 'none',
2967 'acodec': acodec or 'none',
2968 'dynamic_range': hdr,
2969 **({'scodec': scodec} if scodec is not None else {}),
2970 }
2971 elif len(split_codecs) == 2:
2972 return {
2973 'vcodec': split_codecs[0],
2974 'acodec': split_codecs[1],
2975 }
2976 return {}
2977
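# A hand-checked sketch with a typical RFC 6381 codecs string:
#   >>> parse_codecs('avc1.64001f, mp4a.40.2')
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}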
2978
2979 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
2980 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
2981
2982 allow_mkv = not preferences or 'mkv' in preferences
2983
2984 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
2985 return 'mkv' # TODO: does any other format allow this?
2986
2987 # TODO: Not all codecs supported by parse_codecs are handled here
2988 COMPATIBLE_CODECS = {
2989 'mp4': {
2990 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
2991 'h264', 'aacl', 'ec-3', # Set in ISM
2992 },
2993 'webm': {
2994 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
2995 'vp9x', 'vp8x', # in the webm spec
2996 },
2997 }
2998
2999 sanitize_codec = functools.partial(
3000 try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
3001 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3002
3003 for ext in preferences or COMPATIBLE_CODECS.keys():
3004 codec_set = COMPATIBLE_CODECS.get(ext, set())
3005 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3006 return ext
3007
3008 COMPATIBLE_EXTS = (
3009 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3010 {'webm', 'weba'},
3011 )
3012 for ext in preferences or vexts:
3013 current_exts = {ext, *vexts, *aexts}
3014 if ext == 'mkv' or current_exts == {ext} or any(
3015 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3016 return ext
3017 return 'mkv' if allow_mkv else preferences[-1]
3018
3019
3020 def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
3021 getheader = url_handle.headers.get
3022
3023 cd = getheader('Content-Disposition')
3024 if cd:
3025 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3026 if m:
3027 e = determine_ext(m.group('filename'), default_ext=None)
3028 if e:
3029 return e
3030
3031 meta_ext = getheader('x-amz-meta-name')
3032 if meta_ext:
3033 e = meta_ext.rpartition('.')[2]
3034 if e:
3035 return e
3036
3037 return mimetype2ext(getheader('Content-Type'), default=default)
3038
3039
3040 def encode_data_uri(data, mime_type):
3041 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3042
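# A quick sketch:
#   >>> encode_data_uri(b'hello', 'text/plain')
#   'data:text/plain;base64,aGVsbG8='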
3043
3044 def age_restricted(content_limit, age_limit):
3045 """ Returns True iff the content should be blocked """
3046
3047 if age_limit is None: # No limit set
3048 return False
3049 if content_limit is None:
3050 return False # Content available for everyone
3051 return age_limit < content_limit
3052
3053
3054 # List of known byte-order-marks (BOM)
3055 BOMS = [
3056 (b'\xef\xbb\xbf', 'utf-8'),
3057 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3058 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3059 (b'\xff\xfe', 'utf-16-le'),
3060 (b'\xfe\xff', 'utf-16-be'),
3061 ]
3062
3063
3064 def is_html(first_bytes):
3065 """ Detect whether a file contains HTML by examining its first bytes. """
3066
3067 encoding = 'utf-8'
3068 for bom, enc in BOMS:
3069 while first_bytes.startswith(bom):
3070 encoding, first_bytes = enc, first_bytes[len(bom):]
3071
3072 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3073
3074
3075 def determine_protocol(info_dict):
3076 protocol = info_dict.get('protocol')
3077 if protocol is not None:
3078 return protocol
3079
3080 url = sanitize_url(info_dict['url'])
3081 if url.startswith('rtmp'):
3082 return 'rtmp'
3083 elif url.startswith('mms'):
3084 return 'mms'
3085 elif url.startswith('rtsp'):
3086 return 'rtsp'
3087
3088 ext = determine_ext(url)
3089 if ext == 'm3u8':
3090 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3091 elif ext == 'f4m':
3092 return 'f4m'
3093
3094 return urllib.parse.urlparse(url).scheme
3095
3096
3097 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3098 """ Render a list of rows, each as a list of values.
3099 Text after a \t will be right-aligned """
3100 def width(string):
3101 return len(remove_terminal_sequences(string).replace('\t', ''))
3102
3103 def get_max_lens(table):
3104 return [max(width(str(v)) for v in col) for col in zip(*table)]
3105
3106 def filter_using_list(row, filterArray):
3107 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3108
3109 max_lens = get_max_lens(data) if hide_empty else []
3110 header_row = filter_using_list(header_row, max_lens)
3111 data = [filter_using_list(row, max_lens) for row in data]
3112
3113 table = [header_row] + data
3114 max_lens = get_max_lens(table)
3115 extra_gap += 1
3116 if delim:
3117 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3118 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3119 for row in table:
3120 for pos, text in enumerate(map(str, row)):
3121 if '\t' in text:
3122 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3123 else:
3124 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3125 ret = '\n'.join(''.join(row).rstrip() for row in table)
3126 return ret
3127
3128
3129 def _match_one(filter_part, dct, incomplete):
3130 # TODO: Generalize code with YoutubeDL._build_format_filter
3131 STRING_OPERATORS = {
3132 '*=': operator.contains,
3133 '^=': lambda attr, value: attr.startswith(value),
3134 '$=': lambda attr, value: attr.endswith(value),
3135 '~=': lambda attr, value: re.search(value, attr),
3136 }
3137 COMPARISON_OPERATORS = {
3138 **STRING_OPERATORS,
3139 '<=': operator.le, # "<=" must be defined above "<"
3140 '<': operator.lt,
3141 '>=': operator.ge,
3142 '>': operator.gt,
3143 '=': operator.eq,
3144 }
3145
3146 if isinstance(incomplete, bool):
3147 is_incomplete = lambda _: incomplete
3148 else:
3149 is_incomplete = lambda k: k in incomplete
3150
3151 operator_rex = re.compile(r'''(?x)
3152 (?P<key>[a-z_]+)
3153 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3154 (?:
3155 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3156 (?P<strval>.+?)
3157 )
3158 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3159 m = operator_rex.fullmatch(filter_part.strip())
3160 if m:
3161 m = m.groupdict()
3162 unnegated_op = COMPARISON_OPERATORS[m['op']]
3163 if m['negation']:
3164 op = lambda attr, value: not unnegated_op(attr, value)
3165 else:
3166 op = unnegated_op
3167 comparison_value = m['quotedstrval'] or m['strval']  # exactly one of these groups has matched
3168 if m['quote']:
3169 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3170 actual_value = dct.get(m['key'])
3171 numeric_comparison = None
3172 if isinstance(actual_value, (int, float)):
3173 # If the original field is a string and the matching comparison value is
3174 # a number, we should respect the origin of the original field
3175 # and process the comparison value as a string (see
3176 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3177 try:
3178 numeric_comparison = int(comparison_value)
3179 except ValueError:
3180 numeric_comparison = parse_filesize(comparison_value)
3181 if numeric_comparison is None:
3182 numeric_comparison = parse_filesize(f'{comparison_value}B')
3183 if numeric_comparison is None:
3184 numeric_comparison = parse_duration(comparison_value)
3185 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3186 raise ValueError('Operator %s only supports string values!' % m['op'])
3187 if actual_value is None:
3188 return is_incomplete(m['key']) or m['none_inclusive']
3189 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3190
3191 UNARY_OPERATORS = {
3192 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3193 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3194 }
3195 operator_rex = re.compile(r'''(?x)
3196 (?P<op>%s)\s*(?P<key>[a-z_]+)
3197 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3198 m = operator_rex.fullmatch(filter_part.strip())
3199 if m:
3200 op = UNARY_OPERATORS[m.group('op')]
3201 actual_value = dct.get(m.group('key'))
3202 if is_incomplete(m.group('key')) and actual_value is None:
3203 return True
3204 return op(actual_value)
3205
3206 raise ValueError('Invalid filter part %r' % filter_part)
3207
3208
3209 def match_str(filter_str, dct, incomplete=False):
3210 """ Filter a dictionary with a simple string syntax.
3211 @returns Whether the filter passes
3212 @param incomplete Set of keys that are expected to be missing from dct.
3213 Can be True/False to indicate all/none of the keys may be missing.
3214 All conditions on incomplete keys pass if the key is missing
3215 """
3216 return all(
3217 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3218 for filter_part in re.split(r'(?<!\\)&', filter_str))
3219
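# A sketch of the filter mini-language: '&' joins conditions, and a trailing
# '?' on an operator lets the condition pass when the key is missing:
#   >>> match_str('duration > 60 & like_count >? 100', {'duration': 90})
#   True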
3220
3221 def match_filter_func(filters, breaking_filters=None):
3222 if not filters and not breaking_filters:
3223 return None
3224 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3225 filters = set(variadic(filters or []))
3226
3227 interactive = '-' in filters
3228 if interactive:
3229 filters.remove('-')
3230
3231 def _match_func(info_dict, incomplete=False):
3232 ret = breaking_filters(info_dict, incomplete)
3233 if ret is not None:
3234 raise RejectedVideoReached(ret)
3235
3236 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3237 return NO_DEFAULT if interactive and not incomplete else None
3238 else:
3239 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3240 filter_str = ') | ('.join(map(str.strip, filters))
3241 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3242 return _match_func
3243
3244
3245 class download_range_func:
3246 def __init__(self, chapters, ranges, from_info=False):
3247 self.chapters, self.ranges, self.from_info = chapters, ranges, from_info
3248
3249 def __call__(self, info_dict, ydl):
3250
3251 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3252 else 'Cannot match chapters since chapter information is unavailable')
3253 for regex in self.chapters or []:
3254 for i, chapter in enumerate(info_dict.get('chapters') or []):
3255 if re.search(regex, chapter['title']):
3256 warning = None
3257 yield {**chapter, 'index': i}
3258 if self.chapters and warning:
3259 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3260
3261 for start, end in self.ranges or []:
3262 yield {
3263 'start_time': self._handle_negative_timestamp(start, info_dict),
3264 'end_time': self._handle_negative_timestamp(end, info_dict),
3265 }
3266
3267 if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
3268 yield {
3269 'start_time': info_dict.get('start_time') or 0,
3270 'end_time': info_dict.get('end_time') or float('inf'),
3271 }
3272 elif not self.ranges and not self.chapters:
3273 yield {}
3274
3275 @staticmethod
3276 def _handle_negative_timestamp(time, info):
3277 return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time
3278
3279 def __eq__(self, other):
3280 return (isinstance(other, download_range_func)
3281 and self.chapters == other.chapters and self.ranges == other.ranges)
3282
3283 def __repr__(self):
3284 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3285
3286
3287 def parse_dfxp_time_expr(time_expr):
3288 if not time_expr:
3289 return
3290
3291 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3292 if mobj:
3293 return float(mobj.group('time_offset'))
3294
3295 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3296 if mobj:
3297 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3298
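# A quick sketch of the two DFXP time notations handled above:
#   >>> parse_dfxp_time_expr('00:01:02.5')
#   62.5
#   >>> parse_dfxp_time_expr('15.0s')
#   15.0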
3299
3300 def srt_subtitles_timecode(seconds):
3301 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3302
3303
3304 def ass_subtitles_timecode(seconds):
3305 time = timetuple_from_msec(seconds * 1000)
3306 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3307
3308
3309 def dfxp2srt(dfxp_data):
3310 '''
3311 @param dfxp_data A bytes-like object containing DFXP data
3312 @returns A unicode object containing converted SRT data
3313 '''
3314 LEGACY_NAMESPACES = (
3315 (b'http://www.w3.org/ns/ttml', [
3316 b'http://www.w3.org/2004/11/ttaf1',
3317 b'http://www.w3.org/2006/04/ttaf1',
3318 b'http://www.w3.org/2006/10/ttaf1',
3319 ]),
3320 (b'http://www.w3.org/ns/ttml#styling', [
3321 b'http://www.w3.org/ns/ttml#style',
3322 ]),
3323 )
3324
3325 SUPPORTED_STYLING = [
3326 'color',
3327 'fontFamily',
3328 'fontSize',
3329 'fontStyle',
3330 'fontWeight',
3331 'textDecoration'
3332 ]
3333
3334 _x = functools.partial(xpath_with_ns, ns_map={
3335 'xml': 'http://www.w3.org/XML/1998/namespace',
3336 'ttml': 'http://www.w3.org/ns/ttml',
3337 'tts': 'http://www.w3.org/ns/ttml#styling',
3338 })
3339
3340 styles = {}
3341 default_style = {}
3342
3343 class TTMLPElementParser:
3344 _out = ''
3345 _unclosed_elements = []
3346 _applied_styles = []
3347
3348 def start(self, tag, attrib):
3349 if tag in (_x('ttml:br'), 'br'):
3350 self._out += '\n'
3351 else:
3352 unclosed_elements = []
3353 style = {}
3354 element_style_id = attrib.get('style')
3355 if default_style:
3356 style.update(default_style)
3357 if element_style_id:
3358 style.update(styles.get(element_style_id, {}))
3359 for prop in SUPPORTED_STYLING:
3360 prop_val = attrib.get(_x('tts:' + prop))
3361 if prop_val:
3362 style[prop] = prop_val
3363 if style:
3364 font = ''
3365 for k, v in sorted(style.items()):
3366 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3367 continue
3368 if k == 'color':
3369 font += ' color="%s"' % v
3370 elif k == 'fontSize':
3371 font += ' size="%s"' % v
3372 elif k == 'fontFamily':
3373 font += ' face="%s"' % v
3374 elif k == 'fontWeight' and v == 'bold':
3375 self._out += '<b>'
3376 unclosed_elements.append('b')
3377 elif k == 'fontStyle' and v == 'italic':
3378 self._out += '<i>'
3379 unclosed_elements.append('i')
3380 elif k == 'textDecoration' and v == 'underline':
3381 self._out += '<u>'
3382 unclosed_elements.append('u')
3383 if font:
3384 self._out += '<font' + font + '>'
3385 unclosed_elements.append('font')
3386 applied_style = {}
3387 if self._applied_styles:
3388 applied_style.update(self._applied_styles[-1])
3389 applied_style.update(style)
3390 self._applied_styles.append(applied_style)
3391 self._unclosed_elements.append(unclosed_elements)
3392
3393 def end(self, tag):
3394 if tag not in (_x('ttml:br'), 'br'):
3395 unclosed_elements = self._unclosed_elements.pop()
3396 for element in reversed(unclosed_elements):
3397 self._out += '</%s>' % element
3398 if unclosed_elements and self._applied_styles:
3399 self._applied_styles.pop()
3400
3401 def data(self, data):
3402 self._out += data
3403
3404 def close(self):
3405 return self._out.strip()
3406
3407 # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
3408 # This will not trigger false positives since only UTF-8 text is being replaced
3409 dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
3410
3411 def parse_node(node):
3412 target = TTMLPElementParser()
3413 parser = xml.etree.ElementTree.XMLParser(target=target)
3414 parser.feed(xml.etree.ElementTree.tostring(node))
3415 return parser.close()
3416
3417 for k, v in LEGACY_NAMESPACES:
3418 for ns in v:
3419 dfxp_data = dfxp_data.replace(ns, k)
3420
3421 dfxp = compat_etree_fromstring(dfxp_data)
3422 out = []
3423 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3424
3425 if not paras:
3426 raise ValueError('Invalid dfxp/TTML subtitle')
3427
3428 repeat = False
3429 while True:
3430 for style in dfxp.findall(_x('.//ttml:style')):
3431 style_id = style.get('id') or style.get(_x('xml:id'))
3432 if not style_id:
3433 continue
3434 parent_style_id = style.get('style')
3435 if parent_style_id:
3436 if parent_style_id not in styles:
3437 repeat = True
3438 continue
3439 styles[style_id] = styles[parent_style_id].copy()
3440 for prop in SUPPORTED_STYLING:
3441 prop_val = style.get(_x('tts:' + prop))
3442 if prop_val:
3443 styles.setdefault(style_id, {})[prop] = prop_val
3444 if repeat:
3445 repeat = False
3446 else:
3447 break
3448
3449 for p in ('body', 'div'):
3450 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3451 if ele is None:
3452 continue
3453 style = styles.get(ele.get('style'))
3454 if not style:
3455 continue
3456 default_style.update(style)
3457
3458 for para, index in zip(paras, itertools.count(1)):
3459 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3460 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3461 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3462 if begin_time is None:
3463 continue
3464 if not end_time:
3465 if not dur:
3466 continue
3467 end_time = begin_time + dur
3468 out.append('%d\n%s --> %s\n%s\n\n' % (
3469 index,
3470 srt_subtitles_timecode(begin_time),
3471 srt_subtitles_timecode(end_time),
3472 parse_node(para)))
3473
3474 return ''.join(out)
3475
3476
3477 def cli_option(params, command_option, param, separator=None):
3478 param = params.get(param)
3479 return ([] if param is None
3480 else [command_option, str(param)] if separator is None
3481 else [f'{command_option}{separator}{param}'])
3482
3483
3484 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3485 param = params.get(param)
3486 assert param in (True, False, None)
3487 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3488
3489
3490 def cli_valueless_option(params, command_option, param, expected_value=True):
3491 return [command_option] if params.get(param) == expected_value else []
3492
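# A sketch of the cli_* helpers above, with hypothetical option names (note
# how cli_bool_option reuses cli_option by treating the {True/False: value}
# map as the params dict):
#   >>> cli_option({'proxy': 'http://example.com'}, '--proxy', 'proxy')
#   ['--proxy', 'http://example.com']
#   >>> cli_bool_option({'check': True}, '--check', 'check')
#   ['--check', 'true']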
3493
3494 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3495 if isinstance(argdict, (list, tuple)): # for backward compatibility
3496 if use_compat:
3497 return argdict
3498 else:
3499 argdict = None
3500 if argdict is None:
3501 return default
3502 assert isinstance(argdict, dict)
3503
3504 assert isinstance(keys, (list, tuple))
3505 for key_list in keys:
3506 arg_list = list(filter(
3507 lambda x: x is not None,
3508 [argdict.get(key.lower()) for key in variadic(key_list)]))
3509 if arg_list:
3510 return [arg for args in arg_list for arg in args]
3511 return default
3512
3513
3514 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3515 main_key, exe = main_key.lower(), exe.lower()
3516 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3517 keys = [f'{root_key}{k}' for k in (keys or [''])]
3518 if root_key in keys:
3519 if main_key != exe:
3520 keys.append((main_key, exe))
3521 keys.append('default')
3522 else:
3523 use_compat = False
3524 return cli_configuration_args(argdict, keys, default, use_compat)
3525
3526
3527 class ISO639Utils:
3528 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3529 _lang_map = {
3530 'aa': 'aar',
3531 'ab': 'abk',
3532 'ae': 'ave',
3533 'af': 'afr',
3534 'ak': 'aka',
3535 'am': 'amh',
3536 'an': 'arg',
3537 'ar': 'ara',
3538 'as': 'asm',
3539 'av': 'ava',
3540 'ay': 'aym',
3541 'az': 'aze',
3542 'ba': 'bak',
3543 'be': 'bel',
3544 'bg': 'bul',
3545 'bh': 'bih',
3546 'bi': 'bis',
3547 'bm': 'bam',
3548 'bn': 'ben',
3549 'bo': 'bod',
3550 'br': 'bre',
3551 'bs': 'bos',
3552 'ca': 'cat',
3553 'ce': 'che',
3554 'ch': 'cha',
3555 'co': 'cos',
3556 'cr': 'cre',
3557 'cs': 'ces',
3558 'cu': 'chu',
3559 'cv': 'chv',
3560 'cy': 'cym',
3561 'da': 'dan',
3562 'de': 'deu',
3563 'dv': 'div',
3564 'dz': 'dzo',
3565 'ee': 'ewe',
3566 'el': 'ell',
3567 'en': 'eng',
3568 'eo': 'epo',
3569 'es': 'spa',
3570 'et': 'est',
3571 'eu': 'eus',
3572 'fa': 'fas',
3573 'ff': 'ful',
3574 'fi': 'fin',
3575 'fj': 'fij',
3576 'fo': 'fao',
3577 'fr': 'fra',
3578 'fy': 'fry',
3579 'ga': 'gle',
3580 'gd': 'gla',
3581 'gl': 'glg',
3582 'gn': 'grn',
3583 'gu': 'guj',
3584 'gv': 'glv',
3585 'ha': 'hau',
3586 'he': 'heb',
3587 'iw': 'heb', # Replaced by he in 1989 revision
3588 'hi': 'hin',
3589 'ho': 'hmo',
3590 'hr': 'hrv',
3591 'ht': 'hat',
3592 'hu': 'hun',
3593 'hy': 'hye',
3594 'hz': 'her',
3595 'ia': 'ina',
3596 'id': 'ind',
3597 'in': 'ind', # Replaced by id in 1989 revision
3598 'ie': 'ile',
3599 'ig': 'ibo',
3600 'ii': 'iii',
3601 'ik': 'ipk',
3602 'io': 'ido',
3603 'is': 'isl',
3604 'it': 'ita',
3605 'iu': 'iku',
3606 'ja': 'jpn',
3607 'jv': 'jav',
3608 'ka': 'kat',
3609 'kg': 'kon',
3610 'ki': 'kik',
3611 'kj': 'kua',
3612 'kk': 'kaz',
3613 'kl': 'kal',
3614 'km': 'khm',
3615 'kn': 'kan',
3616 'ko': 'kor',
3617 'kr': 'kau',
3618 'ks': 'kas',
3619 'ku': 'kur',
3620 'kv': 'kom',
3621 'kw': 'cor',
3622 'ky': 'kir',
3623 'la': 'lat',
3624 'lb': 'ltz',
3625 'lg': 'lug',
3626 'li': 'lim',
3627 'ln': 'lin',
3628 'lo': 'lao',
3629 'lt': 'lit',
3630 'lu': 'lub',
3631 'lv': 'lav',
3632 'mg': 'mlg',
3633 'mh': 'mah',
3634 'mi': 'mri',
3635 'mk': 'mkd',
3636 'ml': 'mal',
3637 'mn': 'mon',
3638 'mr': 'mar',
3639 'ms': 'msa',
3640 'mt': 'mlt',
3641 'my': 'mya',
3642 'na': 'nau',
3643 'nb': 'nob',
3644 'nd': 'nde',
3645 'ne': 'nep',
3646 'ng': 'ndo',
3647 'nl': 'nld',
3648 'nn': 'nno',
3649 'no': 'nor',
3650 'nr': 'nbl',
3651 'nv': 'nav',
3652 'ny': 'nya',
3653 'oc': 'oci',
3654 'oj': 'oji',
3655 'om': 'orm',
3656 'or': 'ori',
3657 'os': 'oss',
3658 'pa': 'pan',
3659 'pe': 'per',
3660 'pi': 'pli',
3661 'pl': 'pol',
3662 'ps': 'pus',
3663 'pt': 'por',
3664 'qu': 'que',
3665 'rm': 'roh',
3666 'rn': 'run',
3667 'ro': 'ron',
3668 'ru': 'rus',
3669 'rw': 'kin',
3670 'sa': 'san',
3671 'sc': 'srd',
3672 'sd': 'snd',
3673 'se': 'sme',
3674 'sg': 'sag',
3675 'si': 'sin',
3676 'sk': 'slk',
3677 'sl': 'slv',
3678 'sm': 'smo',
3679 'sn': 'sna',
3680 'so': 'som',
3681 'sq': 'sqi',
3682 'sr': 'srp',
3683 'ss': 'ssw',
3684 'st': 'sot',
3685 'su': 'sun',
3686 'sv': 'swe',
3687 'sw': 'swa',
3688 'ta': 'tam',
3689 'te': 'tel',
3690 'tg': 'tgk',
3691 'th': 'tha',
3692 'ti': 'tir',
3693 'tk': 'tuk',
3694 'tl': 'tgl',
3695 'tn': 'tsn',
3696 'to': 'ton',
3697 'tr': 'tur',
3698 'ts': 'tso',
3699 'tt': 'tat',
3700 'tw': 'twi',
3701 'ty': 'tah',
3702 'ug': 'uig',
3703 'uk': 'ukr',
3704 'ur': 'urd',
3705 'uz': 'uzb',
3706 've': 'ven',
3707 'vi': 'vie',
3708 'vo': 'vol',
3709 'wa': 'wln',
3710 'wo': 'wol',
3711 'xh': 'xho',
3712 'yi': 'yid',
3713 'ji': 'yid', # Replaced by yi in 1989 revision
3714 'yo': 'yor',
3715 'za': 'zha',
3716 'zh': 'zho',
3717 'zu': 'zul',
3718 }
3719
3720 @classmethod
3721 def short2long(cls, code):
3722 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3723 return cls._lang_map.get(code[:2])
3724
3725 @classmethod
3726 def long2short(cls, code):
3727 """Convert language code from ISO 639-2/T to ISO 639-1"""
3728 for short_name, long_name in cls._lang_map.items():
3729 if long_name == code:
3730 return short_name
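# Illustrative round trip: ISO639Utils.short2long('en') == 'eng' and
# ISO639Utils.long2short('eng') == 'en'; short2long only inspects the first
# two characters, so 'en-US' also maps to 'eng'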
3731
3732
3733 class ISO3166Utils:
3734 # From http://data.okfn.org/data/core/country-list
3735 _country_map = {
3736 'AF': 'Afghanistan',
3737 'AX': 'Åland Islands',
3738 'AL': 'Albania',
3739 'DZ': 'Algeria',
3740 'AS': 'American Samoa',
3741 'AD': 'Andorra',
3742 'AO': 'Angola',
3743 'AI': 'Anguilla',
3744 'AQ': 'Antarctica',
3745 'AG': 'Antigua and Barbuda',
3746 'AR': 'Argentina',
3747 'AM': 'Armenia',
3748 'AW': 'Aruba',
3749 'AU': 'Australia',
3750 'AT': 'Austria',
3751 'AZ': 'Azerbaijan',
3752 'BS': 'Bahamas',
3753 'BH': 'Bahrain',
3754 'BD': 'Bangladesh',
3755 'BB': 'Barbados',
3756 'BY': 'Belarus',
3757 'BE': 'Belgium',
3758 'BZ': 'Belize',
3759 'BJ': 'Benin',
3760 'BM': 'Bermuda',
3761 'BT': 'Bhutan',
3762 'BO': 'Bolivia, Plurinational State of',
3763 'BQ': 'Bonaire, Sint Eustatius and Saba',
3764 'BA': 'Bosnia and Herzegovina',
3765 'BW': 'Botswana',
3766 'BV': 'Bouvet Island',
3767 'BR': 'Brazil',
3768 'IO': 'British Indian Ocean Territory',
3769 'BN': 'Brunei Darussalam',
3770 'BG': 'Bulgaria',
3771 'BF': 'Burkina Faso',
3772 'BI': 'Burundi',
3773 'KH': 'Cambodia',
3774 'CM': 'Cameroon',
3775 'CA': 'Canada',
3776 'CV': 'Cape Verde',
3777 'KY': 'Cayman Islands',
3778 'CF': 'Central African Republic',
3779 'TD': 'Chad',
3780 'CL': 'Chile',
3781 'CN': 'China',
3782 'CX': 'Christmas Island',
3783 'CC': 'Cocos (Keeling) Islands',
3784 'CO': 'Colombia',
3785 'KM': 'Comoros',
3786 'CG': 'Congo',
3787 'CD': 'Congo, the Democratic Republic of the',
3788 'CK': 'Cook Islands',
3789 'CR': 'Costa Rica',
3790 'CI': 'Côte d\'Ivoire',
3791 'HR': 'Croatia',
3792 'CU': 'Cuba',
3793 'CW': 'Curaçao',
3794 'CY': 'Cyprus',
3795 'CZ': 'Czech Republic',
3796 'DK': 'Denmark',
3797 'DJ': 'Djibouti',
3798 'DM': 'Dominica',
3799 'DO': 'Dominican Republic',
3800 'EC': 'Ecuador',
3801 'EG': 'Egypt',
3802 'SV': 'El Salvador',
3803 'GQ': 'Equatorial Guinea',
3804 'ER': 'Eritrea',
3805 'EE': 'Estonia',
3806 'ET': 'Ethiopia',
3807 'FK': 'Falkland Islands (Malvinas)',
3808 'FO': 'Faroe Islands',
3809 'FJ': 'Fiji',
3810 'FI': 'Finland',
3811 'FR': 'France',
3812 'GF': 'French Guiana',
3813 'PF': 'French Polynesia',
3814 'TF': 'French Southern Territories',
3815 'GA': 'Gabon',
3816 'GM': 'Gambia',
3817 'GE': 'Georgia',
3818 'DE': 'Germany',
3819 'GH': 'Ghana',
3820 'GI': 'Gibraltar',
3821 'GR': 'Greece',
3822 'GL': 'Greenland',
3823 'GD': 'Grenada',
3824 'GP': 'Guadeloupe',
3825 'GU': 'Guam',
3826 'GT': 'Guatemala',
3827 'GG': 'Guernsey',
3828 'GN': 'Guinea',
3829 'GW': 'Guinea-Bissau',
3830 'GY': 'Guyana',
3831 'HT': 'Haiti',
3832 'HM': 'Heard Island and McDonald Islands',
3833 'VA': 'Holy See (Vatican City State)',
3834 'HN': 'Honduras',
3835 'HK': 'Hong Kong',
3836 'HU': 'Hungary',
3837 'IS': 'Iceland',
3838 'IN': 'India',
3839 'ID': 'Indonesia',
3840 'IR': 'Iran, Islamic Republic of',
3841 'IQ': 'Iraq',
3842 'IE': 'Ireland',
3843 'IM': 'Isle of Man',
3844 'IL': 'Israel',
3845 'IT': 'Italy',
3846 'JM': 'Jamaica',
3847 'JP': 'Japan',
3848 'JE': 'Jersey',
3849 'JO': 'Jordan',
3850 'KZ': 'Kazakhstan',
3851 'KE': 'Kenya',
3852 'KI': 'Kiribati',
3853 'KP': 'Korea, Democratic People\'s Republic of',
3854 'KR': 'Korea, Republic of',
3855 'KW': 'Kuwait',
3856 'KG': 'Kyrgyzstan',
3857 'LA': 'Lao People\'s Democratic Republic',
3858 'LV': 'Latvia',
3859 'LB': 'Lebanon',
3860 'LS': 'Lesotho',
3861 'LR': 'Liberia',
3862 'LY': 'Libya',
3863 'LI': 'Liechtenstein',
3864 'LT': 'Lithuania',
3865 'LU': 'Luxembourg',
3866 'MO': 'Macao',
3867 'MK': 'Macedonia, the Former Yugoslav Republic of',
3868 'MG': 'Madagascar',
3869 'MW': 'Malawi',
3870 'MY': 'Malaysia',
3871 'MV': 'Maldives',
3872 'ML': 'Mali',
3873 'MT': 'Malta',
3874 'MH': 'Marshall Islands',
3875 'MQ': 'Martinique',
3876 'MR': 'Mauritania',
3877 'MU': 'Mauritius',
3878 'YT': 'Mayotte',
3879 'MX': 'Mexico',
3880 'FM': 'Micronesia, Federated States of',
3881 'MD': 'Moldova, Republic of',
3882 'MC': 'Monaco',
3883 'MN': 'Mongolia',
3884 'ME': 'Montenegro',
3885 'MS': 'Montserrat',
3886 'MA': 'Morocco',
3887 'MZ': 'Mozambique',
3888 'MM': 'Myanmar',
3889 'NA': 'Namibia',
3890 'NR': 'Nauru',
3891 'NP': 'Nepal',
3892 'NL': 'Netherlands',
3893 'NC': 'New Caledonia',
3894 'NZ': 'New Zealand',
3895 'NI': 'Nicaragua',
3896 'NE': 'Niger',
3897 'NG': 'Nigeria',
3898 'NU': 'Niue',
3899 'NF': 'Norfolk Island',
3900 'MP': 'Northern Mariana Islands',
3901 'NO': 'Norway',
3902 'OM': 'Oman',
3903 'PK': 'Pakistan',
3904 'PW': 'Palau',
3905 'PS': 'Palestine, State of',
3906 'PA': 'Panama',
3907 'PG': 'Papua New Guinea',
3908 'PY': 'Paraguay',
3909 'PE': 'Peru',
3910 'PH': 'Philippines',
3911 'PN': 'Pitcairn',
3912 'PL': 'Poland',
3913 'PT': 'Portugal',
3914 'PR': 'Puerto Rico',
3915 'QA': 'Qatar',
3916 'RE': 'Réunion',
3917 'RO': 'Romania',
3918 'RU': 'Russian Federation',
3919 'RW': 'Rwanda',
3920 'BL': 'Saint Barthélemy',
3921 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3922 'KN': 'Saint Kitts and Nevis',
3923 'LC': 'Saint Lucia',
3924 'MF': 'Saint Martin (French part)',
3925 'PM': 'Saint Pierre and Miquelon',
3926 'VC': 'Saint Vincent and the Grenadines',
3927 'WS': 'Samoa',
3928 'SM': 'San Marino',
3929 'ST': 'Sao Tome and Principe',
3930 'SA': 'Saudi Arabia',
3931 'SN': 'Senegal',
3932 'RS': 'Serbia',
3933 'SC': 'Seychelles',
3934 'SL': 'Sierra Leone',
3935 'SG': 'Singapore',
3936 'SX': 'Sint Maarten (Dutch part)',
3937 'SK': 'Slovakia',
3938 'SI': 'Slovenia',
3939 'SB': 'Solomon Islands',
3940 'SO': 'Somalia',
3941 'ZA': 'South Africa',
3942 'GS': 'South Georgia and the South Sandwich Islands',
3943 'SS': 'South Sudan',
3944 'ES': 'Spain',
3945 'LK': 'Sri Lanka',
3946 'SD': 'Sudan',
3947 'SR': 'Suriname',
3948 'SJ': 'Svalbard and Jan Mayen',
3949 'SZ': 'Swaziland',
3950 'SE': 'Sweden',
3951 'CH': 'Switzerland',
3952 'SY': 'Syrian Arab Republic',
3953 'TW': 'Taiwan, Province of China',
3954 'TJ': 'Tajikistan',
3955 'TZ': 'Tanzania, United Republic of',
3956 'TH': 'Thailand',
3957 'TL': 'Timor-Leste',
3958 'TG': 'Togo',
3959 'TK': 'Tokelau',
3960 'TO': 'Tonga',
3961 'TT': 'Trinidad and Tobago',
3962 'TN': 'Tunisia',
3963 'TR': 'Turkey',
3964 'TM': 'Turkmenistan',
3965 'TC': 'Turks and Caicos Islands',
3966 'TV': 'Tuvalu',
3967 'UG': 'Uganda',
3968 'UA': 'Ukraine',
3969 'AE': 'United Arab Emirates',
3970 'GB': 'United Kingdom',
3971 'US': 'United States',
3972 'UM': 'United States Minor Outlying Islands',
3973 'UY': 'Uruguay',
3974 'UZ': 'Uzbekistan',
3975 'VU': 'Vanuatu',
3976 'VE': 'Venezuela, Bolivarian Republic of',
3977 'VN': 'Viet Nam',
3978 'VG': 'Virgin Islands, British',
3979 'VI': 'Virgin Islands, U.S.',
3980 'WF': 'Wallis and Futuna',
3981 'EH': 'Western Sahara',
3982 'YE': 'Yemen',
3983 'ZM': 'Zambia',
3984 'ZW': 'Zimbabwe',
3985 # Not ISO 3166 codes, but used for IP blocks
3986 'AP': 'Asia/Pacific Region',
3987 'EU': 'Europe',
3988 }
3989
3990 @classmethod
3991 def short2full(cls, code):
3992 """Convert an ISO 3166-2 country code to the corresponding full name"""
3993 return cls._country_map.get(code.upper())
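# Illustrative: ISO3166Utils.short2full('DE') == 'Germany' (input is case-insensitive)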
3994
3995
3996 class GeoUtils:
3997 # Major IPv4 address blocks per country
3998 _country_ip_map = {
3999 'AD': '46.172.224.0/19',
4000 'AE': '94.200.0.0/13',
4001 'AF': '149.54.0.0/17',
4002 'AG': '209.59.64.0/18',
4003 'AI': '204.14.248.0/21',
4004 'AL': '46.99.0.0/16',
4005 'AM': '46.70.0.0/15',
4006 'AO': '105.168.0.0/13',
4007 'AP': '182.50.184.0/21',
4008 'AQ': '23.154.160.0/24',
4009 'AR': '181.0.0.0/12',
4010 'AS': '202.70.112.0/20',
4011 'AT': '77.116.0.0/14',
4012 'AU': '1.128.0.0/11',
4013 'AW': '181.41.0.0/18',
4014 'AX': '185.217.4.0/22',
4015 'AZ': '5.197.0.0/16',
4016 'BA': '31.176.128.0/17',
4017 'BB': '65.48.128.0/17',
4018 'BD': '114.130.0.0/16',
4019 'BE': '57.0.0.0/8',
4020 'BF': '102.178.0.0/15',
4021 'BG': '95.42.0.0/15',
4022 'BH': '37.131.0.0/17',
4023 'BI': '154.117.192.0/18',
4024 'BJ': '137.255.0.0/16',
4025 'BL': '185.212.72.0/23',
4026 'BM': '196.12.64.0/18',
4027 'BN': '156.31.0.0/16',
4028 'BO': '161.56.0.0/16',
4029 'BQ': '161.0.80.0/20',
4030 'BR': '191.128.0.0/12',
4031 'BS': '24.51.64.0/18',
4032 'BT': '119.2.96.0/19',
4033 'BW': '168.167.0.0/16',
4034 'BY': '178.120.0.0/13',
4035 'BZ': '179.42.192.0/18',
4036 'CA': '99.224.0.0/11',
4037 'CD': '41.243.0.0/16',
4038 'CF': '197.242.176.0/21',
4039 'CG': '160.113.0.0/16',
4040 'CH': '85.0.0.0/13',
4041 'CI': '102.136.0.0/14',
4042 'CK': '202.65.32.0/19',
4043 'CL': '152.172.0.0/14',
4044 'CM': '102.244.0.0/14',
4045 'CN': '36.128.0.0/10',
4046 'CO': '181.240.0.0/12',
4047 'CR': '201.192.0.0/12',
4048 'CU': '152.206.0.0/15',
4049 'CV': '165.90.96.0/19',
4050 'CW': '190.88.128.0/17',
4051 'CY': '31.153.0.0/16',
4052 'CZ': '88.100.0.0/14',
4053 'DE': '53.0.0.0/8',
4054 'DJ': '197.241.0.0/17',
4055 'DK': '87.48.0.0/12',
4056 'DM': '192.243.48.0/20',
4057 'DO': '152.166.0.0/15',
4058 'DZ': '41.96.0.0/12',
4059 'EC': '186.68.0.0/15',
4060 'EE': '90.190.0.0/15',
4061 'EG': '156.160.0.0/11',
4062 'ER': '196.200.96.0/20',
4063 'ES': '88.0.0.0/11',
4064 'ET': '196.188.0.0/14',
4065 'EU': '2.16.0.0/13',
4066 'FI': '91.152.0.0/13',
4067 'FJ': '144.120.0.0/16',
4068 'FK': '80.73.208.0/21',
4069 'FM': '119.252.112.0/20',
4070 'FO': '88.85.32.0/19',
4071 'FR': '90.0.0.0/9',
4072 'GA': '41.158.0.0/15',
4073 'GB': '25.0.0.0/8',
4074 'GD': '74.122.88.0/21',
4075 'GE': '31.146.0.0/16',
4076 'GF': '161.22.64.0/18',
4077 'GG': '62.68.160.0/19',
4078 'GH': '154.160.0.0/12',
4079 'GI': '95.164.0.0/16',
4080 'GL': '88.83.0.0/19',
4081 'GM': '160.182.0.0/15',
4082 'GN': '197.149.192.0/18',
4083 'GP': '104.250.0.0/19',
4084 'GQ': '105.235.224.0/20',
4085 'GR': '94.64.0.0/13',
4086 'GT': '168.234.0.0/16',
4087 'GU': '168.123.0.0/16',
4088 'GW': '197.214.80.0/20',
4089 'GY': '181.41.64.0/18',
4090 'HK': '113.252.0.0/14',
4091 'HN': '181.210.0.0/16',
4092 'HR': '93.136.0.0/13',
4093 'HT': '148.102.128.0/17',
4094 'HU': '84.0.0.0/14',
4095 'ID': '39.192.0.0/10',
4096 'IE': '87.32.0.0/12',
4097 'IL': '79.176.0.0/13',
4098 'IM': '5.62.80.0/20',
4099 'IN': '117.192.0.0/10',
4100 'IO': '203.83.48.0/21',
4101 'IQ': '37.236.0.0/14',
4102 'IR': '2.176.0.0/12',
4103 'IS': '82.221.0.0/16',
4104 'IT': '79.0.0.0/10',
4105 'JE': '87.244.64.0/18',
4106 'JM': '72.27.0.0/17',
4107 'JO': '176.29.0.0/16',
4108 'JP': '133.0.0.0/8',
4109 'KE': '105.48.0.0/12',
4110 'KG': '158.181.128.0/17',
4111 'KH': '36.37.128.0/17',
4112 'KI': '103.25.140.0/22',
4113 'KM': '197.255.224.0/20',
4114 'KN': '198.167.192.0/19',
4115 'KP': '175.45.176.0/22',
4116 'KR': '175.192.0.0/10',
4117 'KW': '37.36.0.0/14',
4118 'KY': '64.96.0.0/15',
4119 'KZ': '2.72.0.0/13',
4120 'LA': '115.84.64.0/18',
4121 'LB': '178.135.0.0/16',
4122 'LC': '24.92.144.0/20',
4123 'LI': '82.117.0.0/19',
4124 'LK': '112.134.0.0/15',
4125 'LR': '102.183.0.0/16',
4126 'LS': '129.232.0.0/17',
4127 'LT': '78.56.0.0/13',
4128 'LU': '188.42.0.0/16',
4129 'LV': '46.109.0.0/16',
4130 'LY': '41.252.0.0/14',
4131 'MA': '105.128.0.0/11',
4132 'MC': '88.209.64.0/18',
4133 'MD': '37.246.0.0/16',
4134 'ME': '178.175.0.0/17',
4135 'MF': '74.112.232.0/21',
4136 'MG': '154.126.0.0/17',
4137 'MH': '117.103.88.0/21',
4138 'MK': '77.28.0.0/15',
4139 'ML': '154.118.128.0/18',
4140 'MM': '37.111.0.0/17',
4141 'MN': '49.0.128.0/17',
4142 'MO': '60.246.0.0/16',
4143 'MP': '202.88.64.0/20',
4144 'MQ': '109.203.224.0/19',
4145 'MR': '41.188.64.0/18',
4146 'MS': '208.90.112.0/22',
4147 'MT': '46.11.0.0/16',
4148 'MU': '105.16.0.0/12',
4149 'MV': '27.114.128.0/18',
4150 'MW': '102.70.0.0/15',
4151 'MX': '187.192.0.0/11',
4152 'MY': '175.136.0.0/13',
4153 'MZ': '197.218.0.0/15',
4154 'NA': '41.182.0.0/16',
4155 'NC': '101.101.0.0/18',
4156 'NE': '197.214.0.0/18',
4157 'NF': '203.17.240.0/22',
4158 'NG': '105.112.0.0/12',
4159 'NI': '186.76.0.0/15',
4160 'NL': '145.96.0.0/11',
4161 'NO': '84.208.0.0/13',
4162 'NP': '36.252.0.0/15',
4163 'NR': '203.98.224.0/19',
4164 'NU': '49.156.48.0/22',
4165 'NZ': '49.224.0.0/14',
4166 'OM': '5.36.0.0/15',
4167 'PA': '186.72.0.0/15',
4168 'PE': '186.160.0.0/14',
4169 'PF': '123.50.64.0/18',
4170 'PG': '124.240.192.0/19',
4171 'PH': '49.144.0.0/13',
4172 'PK': '39.32.0.0/11',
4173 'PL': '83.0.0.0/11',
4174 'PM': '70.36.0.0/20',
4175 'PR': '66.50.0.0/16',
4176 'PS': '188.161.0.0/16',
4177 'PT': '85.240.0.0/13',
4178 'PW': '202.124.224.0/20',
4179 'PY': '181.120.0.0/14',
4180 'QA': '37.210.0.0/15',
4181 'RE': '102.35.0.0/16',
4182 'RO': '79.112.0.0/13',
4183 'RS': '93.86.0.0/15',
4184 'RU': '5.136.0.0/13',
4185 'RW': '41.186.0.0/16',
4186 'SA': '188.48.0.0/13',
4187 'SB': '202.1.160.0/19',
4188 'SC': '154.192.0.0/11',
4189 'SD': '102.120.0.0/13',
4190 'SE': '78.64.0.0/12',
4191 'SG': '8.128.0.0/10',
4192 'SI': '188.196.0.0/14',
4193 'SK': '78.98.0.0/15',
4194 'SL': '102.143.0.0/17',
4195 'SM': '89.186.32.0/19',
4196 'SN': '41.82.0.0/15',
4197 'SO': '154.115.192.0/18',
4198 'SR': '186.179.128.0/17',
4199 'SS': '105.235.208.0/21',
4200 'ST': '197.159.160.0/19',
4201 'SV': '168.243.0.0/16',
4202 'SX': '190.102.0.0/20',
4203 'SY': '5.0.0.0/16',
4204 'SZ': '41.84.224.0/19',
4205 'TC': '65.255.48.0/20',
4206 'TD': '154.68.128.0/19',
4207 'TG': '196.168.0.0/14',
4208 'TH': '171.96.0.0/13',
4209 'TJ': '85.9.128.0/18',
4210 'TK': '27.96.24.0/21',
4211 'TL': '180.189.160.0/20',
4212 'TM': '95.85.96.0/19',
4213 'TN': '197.0.0.0/11',
4214 'TO': '175.176.144.0/21',
4215 'TR': '78.160.0.0/11',
4216 'TT': '186.44.0.0/15',
4217 'TV': '202.2.96.0/19',
4218 'TW': '120.96.0.0/11',
4219 'TZ': '156.156.0.0/14',
4220 'UA': '37.52.0.0/14',
4221 'UG': '102.80.0.0/13',
4222 'US': '6.0.0.0/8',
4223 'UY': '167.56.0.0/13',
4224 'UZ': '84.54.64.0/18',
4225 'VA': '212.77.0.0/19',
4226 'VC': '207.191.240.0/21',
4227 'VE': '186.88.0.0/13',
4228 'VG': '66.81.192.0/20',
4229 'VI': '146.226.0.0/16',
4230 'VN': '14.160.0.0/11',
4231 'VU': '202.80.32.0/20',
4232 'WF': '117.20.32.0/21',
4233 'WS': '202.4.32.0/19',
4234 'YE': '134.35.0.0/16',
4235 'YT': '41.242.116.0/22',
4236 'ZA': '41.0.0.0/11',
4237 'ZM': '102.144.0.0/13',
4238 'ZW': '102.177.192.0/18',
4239 }
4240
4241 @classmethod
4242 def random_ipv4(cls, code_or_block):
4243 if len(code_or_block) == 2:
4244 block = cls._country_ip_map.get(code_or_block.upper())
4245 if not block:
4246 return None
4247 else:
4248 block = code_or_block
4249 addr, preflen = block.split('/')
4250 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4251 addr_max = addr_min | (0xffffffff >> int(preflen))
4252 return str(socket.inet_ntoa(
4253 struct.pack('!L', random.randint(addr_min, addr_max))))
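# Illustrative: GeoUtils.random_ipv4('DE') draws a random address from
# 53.0.0.0/8 (see _country_ip_map above), while a full CIDR block such as
# GeoUtils.random_ipv4('8.8.8.0/24') is used as-is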
4254
4255
4256 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4257 # released into Public Domain
4258 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4259
4260 def long_to_bytes(n, blocksize=0):
4261 """long_to_bytes(n:long, blocksize:int) : string
4262 Convert a long integer to a byte string.
4263
4264 If optional blocksize is given and greater than zero, pad the front of the
4265 byte string with binary zeros so that the length is a multiple of
4266 blocksize.
4267 """
4268 # after much testing, this algorithm was deemed to be the fastest
4269 s = b''
4270 n = int(n)
4271 while n > 0:
4272 s = struct.pack('>I', n & 0xffffffff) + s
4273 n = n >> 32
4274 # strip off leading zeros
4275 for i in range(len(s)):
4276 if s[i] != b'\000'[0]:
4277 break
4278 else:
4279 # only happens when n == 0
4280 s = b'\000'
4281 i = 0
4282 s = s[i:]
4283 # add back some pad bytes. this could be done more efficiently w.r.t. the
4284 # de-padding being done above, but sigh...
4285 if blocksize > 0 and len(s) % blocksize:
4286 s = (blocksize - len(s) % blocksize) * b'\000' + s
4287 return s
4288
4289
4290 def bytes_to_long(s):
4291 """bytes_to_long(string) : long
4292 Convert a byte string to a long integer.
4293
4294 This is (essentially) the inverse of long_to_bytes().
4295 """
4296 acc = 0
4297 length = len(s)
4298 if length % 4:
4299 extra = (4 - length % 4)
4300 s = b'\000' * extra + s
4301 length = length + extra
4302 for i in range(0, length, 4):
4303 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4304 return acc
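# Illustrative round trip for the pair above:
#   long_to_bytes(65537) == b'\x01\x00\x01'
#   long_to_bytes(65537, blocksize=4) == b'\x00\x01\x00\x01'
#   bytes_to_long(b'\x01\x00\x01') == 65537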
4305
4306
4307 def ohdave_rsa_encrypt(data, exponent, modulus):
4308 '''
4309 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4310
4311 Input:
4312 data: data to encrypt, bytes-like object
4313 exponent, modulus: parameter e and N of RSA algorithm, both integer
4314 Output: hex string of encrypted data
4315
4316 Limitation: supports one block encryption only
4317 '''
4318
4319 payload = int(binascii.hexlify(data[::-1]), 16)
4320 encrypted = pow(payload, exponent, modulus)
4321 return '%x' % encrypted
4322
4323
4324 def pkcs1pad(data, length):
4325 """
4326 Padding input data with PKCS#1 scheme
4327
4328 @param {int[]} data input data
4329 @param {int} length target length
4330 @returns {int[]} padded data
4331 """
4332 if len(data) > length - 11:
4333 raise ValueError('Input data too long for PKCS#1 padding')
4334
4335 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding octets must be non-zero
4336 return [0, 2] + pseudo_random + [0] + data
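# Illustrative: the result has the PKCS#1 v1.5 layout
# [0x00, 0x02, <pseudo-random non-zero octets>, 0x00, <data>], so
# pkcs1pad([222, 173], 16) is a 16-element list starting [0, 2, ...] and ending [..., 0, 222, 173]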
4337
4338
4339 def _base_n_table(n, table):
4340 if not table and not n:
4341 raise ValueError('Either table or n must be specified')
4342 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4343
4344 if n and n != len(table):
4345 raise ValueError(f'base {n} exceeds table length {len(table)}')
4346 return table
4347
4348
4349 def encode_base_n(num, n=None, table=None):
4350 """Convert given int to a base-n string"""
4351 table = _base_n_table(n, table)
4352 if not num:
4353 return table[0]
4354
4355 result, base = '', len(table)
4356 while num:
4357 result = table[num % base] + result
4358 num = num // base
4359 return result
4360
4361
4362 def decode_base_n(string, n=None, table=None):
4363 """Convert given base-n string to int"""
4364 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4365 result, base = 0, len(table)
4366 for char in string:
4367 result = result * base + table[char]
4368 return result
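# Illustrative: encode_base_n(255, 16) == 'ff' and decode_base_n('ff', 16) == 255;
# a custom alphabet may be given instead of n, e.g. encode_base_n(5, table='01') == '101'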
4369
4370
4371 def decode_packed_codes(code):
4372 mobj = re.search(PACKED_CODES_RE, code)
4373 obfuscated_code, base, count, symbols = mobj.groups()
4374 base = int(base)
4375 count = int(count)
4376 symbols = symbols.split('|')
4377 symbol_table = {}
4378
4379 while count:
4380 count -= 1
4381 base_n_count = encode_base_n(count, base)
4382 symbol_table[base_n_count] = symbols[count] or base_n_count
4383
4384 return re.sub(
4385 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4386 obfuscated_code)
4387
4388
4389 def caesar(s, alphabet, shift):
4390 if shift == 0:
4391 return s
4392 l = len(alphabet)
4393 return ''.join(
4394 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4395 for c in s)
4396
4397
4398 def rot47(s):
4399 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
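# Illustrative: caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 1) == 'bcd'
# (characters outside the alphabet pass through unchanged), and rot47('rot47') == 'C@Ecf'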
4400
4401
4402 def parse_m3u8_attributes(attrib):
4403 info = {}
4404 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4405 if val.startswith('"'):
4406 val = val[1:-1]
4407 info[key] = val
4408 return info
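# Illustrative:
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#   == {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}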
4409
4410
4411 def urshift(val, n):
4412 return val >> n if val >= 0 else (val + 0x100000000) >> n
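# Illustrative: urshift emulates a 32-bit unsigned right shift (JavaScript's >>>),
# e.g. urshift(-1, 28) == 15 whereas Python's -1 >> 28 == -1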
4413
4414
4415 def write_xattr(path, key, value):
4416 # Windows: Write xattrs to NTFS Alternate Data Streams:
4417 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4418 if compat_os_name == 'nt':
4419 assert ':' not in key
4420 assert os.path.exists(path)
4421
4422 try:
4423 with open(f'{path}:{key}', 'wb') as f:
4424 f.write(value)
4425 except OSError as e:
4426 raise XAttrMetadataError(e.errno, e.strerror)
4427 return
4428
4429 # UNIX Method 1. Use xattrs/pyxattrs modules
4430
4431 setxattr = None
4432 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4433 # Unicode arguments are not supported in pyxattr until version 0.5.0
4434 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4435 if version_tuple(xattr.__version__) >= (0, 5, 0):
4436 setxattr = xattr.set
4437 elif xattr:
4438 setxattr = xattr.setxattr
4439
4440 if setxattr:
4441 try:
4442 setxattr(path, key, value)
4443 except OSError as e:
4444 raise XAttrMetadataError(e.errno, e.strerror)
4445 return
4446
4447 # UNIX Method 2. Use setfattr/xattr executables
4448 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4449 else 'xattr' if check_executable('xattr', ['-h']) else None)
4450 if not exe:
4451 raise XAttrUnavailableError(
4452 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4453 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4454
4455 value = value.decode()
4456 try:
4457 _, stderr, returncode = Popen.run(
4458 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4459 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4460 except OSError as e:
4461 raise XAttrMetadataError(e.errno, e.strerror)
4462 if returncode:
4463 raise XAttrMetadataError(returncode, stderr)
4464
4465
4466 def random_birthday(year_field, month_field, day_field):
4467 start_date = datetime.date(1950, 1, 1)
4468 end_date = datetime.date(1995, 12, 31)
4469 offset = random.randint(0, (end_date - start_date).days)
4470 random_date = start_date + datetime.timedelta(offset)
4471 return {
4472 year_field: str(random_date.year),
4473 month_field: str(random_date.month),
4474 day_field: str(random_date.day),
4475 }
4476
4477
4478 def find_available_port(interface=''):
4479 try:
4480 with socket.socket() as sock:
4481 sock.bind((interface, 0))
4482 return sock.getsockname()[1]
4483 except OSError:
4484 return None
4485
4486
4487 # Templates for internet shortcut files, which are plain text files.
4488 DOT_URL_LINK_TEMPLATE = '''\
4489 [InternetShortcut]
4490 URL=%(url)s
4491 '''
4492
4493 DOT_WEBLOC_LINK_TEMPLATE = '''\
4494 <?xml version="1.0" encoding="UTF-8"?>
4495 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4496 <plist version="1.0">
4497 <dict>
4498 \t<key>URL</key>
4499 \t<string>%(url)s</string>
4500 </dict>
4501 </plist>
4502 '''
4503
4504 DOT_DESKTOP_LINK_TEMPLATE = '''\
4505 [Desktop Entry]
4506 Encoding=UTF-8
4507 Name=%(filename)s
4508 Type=Link
4509 URL=%(url)s
4510 Icon=text-html
4511 '''
4512
4513 LINK_TEMPLATES = {
4514 'url': DOT_URL_LINK_TEMPLATE,
4515 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4516 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4517 }
4518
4519
4520 def iri_to_uri(iri):
4521 """
4522 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4523
4524 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4525 """
4526
4527 iri_parts = urllib.parse.urlparse(iri)
4528
4529 if '[' in iri_parts.netloc:
4530 raise ValueError('IPv6 URIs are not yet supported.')
4531 # Querying `.netloc` when there is only one bracket also raises a ValueError.
4532
4533 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4534
4535 net_location = ''
4536 if iri_parts.username:
4537 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
4538 if iri_parts.password is not None:
4539 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
4540 net_location += '@'
4541
4542 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
4543 # The 'idna' encoding produces ASCII text.
4544 if iri_parts.port is not None and iri_parts.port != 80:
4545 net_location += ':' + str(iri_parts.port)
4546
4547 return urllib.parse.urlunparse(
4548 (iri_parts.scheme,
4549 net_location,
4550
4551 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4552
4553 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4554 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4555
4556 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4557 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4558
4559 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4560
4561 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4562
4563
4564 def to_high_limit_path(path):
4565 if sys.platform in ['win32', 'cygwin']:
4566 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4567 return '\\\\?\\' + os.path.abspath(path)
4568
4569 return path
4570
4571
4572 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
4573 val = traversal.traverse_obj(obj, *variadic(field))
4574 if not val if ignore is NO_DEFAULT else val in variadic(ignore):
4575 return default
4576 return template % func(val)
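# Illustrative: format_field({'height': 720}, 'height', '%sp') == '720p' and
# format_field({}, 'height', '%sp', default='unknown') == 'unknown'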
4577
4578
4579 def clean_podcast_url(url):
4580 url = re.sub(r'''(?x)
4581 (?:
4582 (?:
4583 chtbl\.com/track|
4584 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4585 play\.podtrac\.com|
4586 chrt\.fm/track|
4587 mgln\.ai/e
4588 )(?:/[^/.]+)?|
4589 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4590 flex\.acast\.com|
4591 pd(?:
4592 cn\.co| # https://podcorn.com/analytics-prefix/
4593 st\.fm # https://podsights.com/docs/
4594 )/e|
4595 [0-9]\.gum\.fm|
4596 pscrb\.fm/rss/p
4597 )/''', '', url)
4598 return re.sub(r'^\w+://(\w+://)', r'\1', url)
4599
4600
4601 _HEX_TABLE = '0123456789abcdef'
4602
4603
4604 def random_uuidv4():
4605 return re.sub(r'[xy]', lambda m: _HEX_TABLE[random.randint(8, 11) if m.group(0) == 'y' else random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')  # the 'y' nibble carries the RFC 4122 variant bits (8-b)
4606
4607
4608 def make_dir(path, to_screen=None):
4609 try:
4610 dn = os.path.dirname(path)
4611 if dn:
4612 os.makedirs(dn, exist_ok=True)
4613 return True
4614 except OSError as err:
4615 if callable(to_screen):
4616 to_screen(f'unable to create directory {err}')
4617 return False
4618
4619
4620 def get_executable_path():
4621 from ..update import _get_variant_and_executable_path
4622
4623 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
4624
4625
4626 def get_user_config_dirs(package_name):
4627 # .config (e.g. ~/.config/package_name)
4628 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
4629 yield os.path.join(xdg_config_home, package_name)
4630
4631 # appdata (%APPDATA%/package_name)
4632 appdata_dir = os.getenv('appdata')
4633 if appdata_dir:
4634 yield os.path.join(appdata_dir, package_name)
4635
4636 # home (~/.package_name)
4637 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
4638
4639
4640 def get_system_config_dirs(package_name):
4641 # /etc/package_name
4642 yield os.path.join('/etc', package_name)
4643
4644
4645 def time_seconds(**kwargs):
4646 """
4647 Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
4648 """
4649 return time.time() + datetime.timedelta(**kwargs).total_seconds()
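# Illustrative: time_seconds(hours=9) returns the current epoch time shifted
# forward by 9 hours, e.g. for sites that expect timestamps in UTC+09:00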
4650
4651
4652 # Create a JSON Web Signature (JWS) with the HS256 algorithm;
4653 # the result is in JWS Compact Serialization format
4654 # Implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
4655 # and JWS https://www.rfc-editor.org/rfc/rfc7515.html
4656 def jwt_encode_hs256(payload_data, key, headers={}):
4657 header_data = {
4658 'alg': 'HS256',
4659 'typ': 'JWT',
4660 }
4661 if headers:
4662 header_data.update(headers)
4663 header_b64 = base64.b64encode(json.dumps(header_data).encode())
4664 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
4665 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
4666 signature_b64 = base64.b64encode(h.digest())
4667 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4668 return token
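# Illustrative (hypothetical key and payload):
#   jwt_encode_hs256({'iss': 'example'}, 'secret')  # -> b'<header>.<payload>.<signature>'
# Note that standard base64 with padding is used rather than the strict
# RFC 7515 base64url alphabet, so tokens may contain '+', '/' and '='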
4669
4670
4671 # Can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
4672 def jwt_decode_hs256(jwt):
4673 header_b64, payload_b64, signature_b64 = jwt.split('.')
4674 # Add trailing '='s that may have been stripped; superfluous '='s are ignored
4675 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
4676 return payload_data
4677
4678
4679 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4680
4681
4682 @functools.cache
4683 def supports_terminal_sequences(stream):
4684 if compat_os_name == 'nt':
4685 if not WINDOWS_VT_MODE:
4686 return False
4687 elif not os.getenv('TERM'):
4688 return False
4689 try:
4690 return stream.isatty()
4691 except BaseException:
4692 return False
4693
4694
4695 def windows_enable_vt_mode():
4696 """Ref: https://bugs.python.org/issue30075 """
4697 if get_windows_version() < (10, 0, 10586):
4698 return
4699
4700 import ctypes
4701 import ctypes.wintypes
4702 import msvcrt
4703
4704 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
4705
4706 dll = ctypes.WinDLL('kernel32', use_last_error=False)
4707 handle = os.open('CONOUT$', os.O_RDWR)
4708 try:
4709 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
4710 dw_original_mode = ctypes.wintypes.DWORD()
4711 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
4712 if not success:
4713 raise Exception('GetConsoleMode failed')
4714
4715 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
4716 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
4717 if not success:
4718 raise Exception('SetConsoleMode failed')
4719 finally:
4720 os.close(handle)
4721
4722 global WINDOWS_VT_MODE
4723 WINDOWS_VT_MODE = True
4724 supports_terminal_sequences.cache_clear()
4725
4726
4727 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
4728
4729
4730 def remove_terminal_sequences(string):
4731 return _terminal_sequences_re.sub('', string)
4732
4733
4734 def number_of_digits(number):
4735 return len('%d' % number)
4736
4737
4738 def join_nonempty(*values, delim='-', from_dict=None):
4739 if from_dict is not None:
4740 values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
4741 return delim.join(map(str, filter(None, values)))
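# Illustrative: join_nonempty('a', None, '', 'b') == 'a-b' and
# join_nonempty(2021, 5, delim='.') == '2021.5' (falsy values are dropped)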
4742
4743
4744 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
4745 """
4746 Find the largest format dimensions in terms of video width and, for each thumbnail:
4747 * Modify the URL: match the width with the provided regex and replace it with the largest format's width
4748 * Update dimensions
4749
4750 This function is useful with video services that scale the provided thumbnails on demand
4751 """
4752 _keys = ('width', 'height')
4753 max_dimensions = max(
4754 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
4755 default=(0, 0))
4756 if not max_dimensions[0]:
4757 return thumbnails
4758 return [
4759 merge_dicts(
4760 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
4761 dict(zip(_keys, max_dimensions)), thumbnail)
4762 for thumbnail in thumbnails
4763 ]
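# Illustrative (hypothetical URL pattern): with a 1920x1080 entry in formats and
# url_width_re=r'(?<=/)\d+(?=px/)', a thumbnail URL like
# 'https://example.com/320px/thumb.jpg' becomes 'https://example.com/1920px/thumb.jpg'
# and the thumbnail gains width=1920, height=1080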
4764
4765
4766 def parse_http_range(range):
4767 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
4768 if not range:
4769 return None, None, None
4770 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
4771 if not crg:
4772 return None, None, None
4773 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
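# Illustrative: parse_http_range('bytes 0-499/1234') == (0, 499, 1234) and
# parse_http_range('bytes=500-') == (500, None, None)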
4774
4775
4776 def read_stdin(what):
4777 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
4778 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
4779 return sys.stdin
4780
4781
4782 def determine_file_encoding(data):
4783 """
4784 Detect the text encoding used
4785 @returns (encoding, bytes to skip)
4786 """
4787
4788 # BOMs are given priority over coding declarations
4789 for bom, enc in BOMS:
4790 if data.startswith(bom):
4791 return enc, len(bom)
4792
4793 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
4794 # We ignore the endianness to get a good enough match
4795 data = data.replace(b'\0', b'')
4796 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
4797 return mobj.group(1).decode() if mobj else None, 0
4798
4799
4800 class Config:
4801 own_args = None
4802 parsed_args = None
4803 filename = None
4804 __initialized = False
4805
4806 def __init__(self, parser, label=None):
4807 self.parser, self.label = parser, label
4808 self._loaded_paths, self.configs = set(), []
4809
4810 def init(self, args=None, filename=None):
4811 assert not self.__initialized
4812 self.own_args, self.filename = args, filename
4813 return self.load_configs()
4814
4815 def load_configs(self):
4816 directory = ''
4817 if self.filename:
4818 location = os.path.realpath(self.filename)
4819 directory = os.path.dirname(location)
4820 if location in self._loaded_paths:
4821 return False
4822 self._loaded_paths.add(location)
4823
4824 self.__initialized = True
4825 opts, _ = self.parser.parse_known_args(self.own_args)
4826 self.parsed_args = self.own_args
4827 for location in opts.config_locations or []:
4828 if location == '-':
4829 if location in self._loaded_paths:
4830 continue
4831 self._loaded_paths.add(location)
4832 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
4833 continue
4834 location = os.path.join(directory, expand_path(location))
4835 if os.path.isdir(location):
4836 location = os.path.join(location, 'yt-dlp.conf')
4837 if not os.path.exists(location):
4838 self.parser.error(f'config location {location} does not exist')
4839 self.append_config(self.read_file(location), location)
4840 return True
4841
4842 def __str__(self):
4843 label = join_nonempty(
4844 self.label, 'config', f'"{self.filename}"' if self.filename else '',
4845 delim=' ')
4846 return join_nonempty(
4847 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
4848 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
4849 delim='\n')
4850
4851 @staticmethod
4852 def read_file(filename, default=[]):
4853 try:
4854 optionf = open(filename, 'rb')
4855 except OSError:
4856 return default # silently skip if file is not present
4857 try:
4858 enc, skip = determine_file_encoding(optionf.read(512))
4859 optionf.seek(skip, io.SEEK_SET)
4860 except OSError:
4861 enc = None # silently skip read errors
4862 try:
4863 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
4864 contents = optionf.read().decode(enc or preferredencoding())
4865 res = shlex.split(contents, comments=True)
4866 except Exception as err:
4867 raise ValueError(f'Unable to parse "{filename}": {err}')
4868 finally:
4869 optionf.close()
4870 return res
4871
4872 @staticmethod
4873 def hide_login_info(opts):
4874 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
4875 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
4876
4877 def _scrub_eq(o):
4878 m = eqre.match(o)
4879 if m:
4880 return m.group('key') + '=PRIVATE'
4881 else:
4882 return o
4883
4884 opts = list(map(_scrub_eq, opts))
4885 for idx, opt in enumerate(opts):
4886 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
4887 opts[idx + 1] = 'PRIVATE'
4888 return opts
4889
4890 def append_config(self, *args, label=None):
4891 config = type(self)(self.parser, label)
4892 config._loaded_paths = self._loaded_paths
4893 if config.init(*args):
4894 self.configs.append(config)
4895
4896 @property
4897 def all_args(self):
4898 for config in reversed(self.configs):
4899 yield from config.all_args
4900 yield from self.parsed_args or []
4901
4902 def parse_known_args(self, **kwargs):
4903 return self.parser.parse_known_args(self.all_args, **kwargs)
4904
4905 def parse_args(self):
4906 return self.parser.parse_args(self.all_args)
4907
4908
4909 class WebSocketsWrapper:
4910 """Wraps websockets module to use in non-async scopes"""
4911 pool = None
4912
4913 def __init__(self, url, headers=None, connect=True):
4914 self.loop = asyncio.new_event_loop()
4915 # XXX: "loop" is deprecated
4916 self.conn = websockets.connect(
4917 url, extra_headers=headers, ping_interval=None,
4918 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
4919 if connect:
4920 self.__enter__()
4921 atexit.register(self.__exit__, None, None, None)
4922
4923 def __enter__(self):
4924 if not self.pool:
4925 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
4926 return self
4927
4928 def send(self, *args):
4929 self.run_with_loop(self.pool.send(*args), self.loop)
4930
4931 def recv(self, *args):
4932 return self.run_with_loop(self.pool.recv(*args), self.loop)
4933
4934 def __exit__(self, type, value, traceback):
4935 try:
4936 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
4937 finally:
4938 self._cancel_all_tasks(self.loop)  # cancel pending tasks while the loop is still open
4939 self.loop.close()
4940
4941 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
4942 # For contributors: if any new library using asyncio needs to be run in non-async code, move these functions out of this class
4943 @staticmethod
4944 def run_with_loop(main, loop):
4945 if not asyncio.iscoroutine(main):
4946 raise ValueError(f'a coroutine was expected, got {main!r}')
4947
4948 try:
4949 return loop.run_until_complete(main)
4950 finally:
4951 loop.run_until_complete(loop.shutdown_asyncgens())
4952 if hasattr(loop, 'shutdown_default_executor'):
4953 loop.run_until_complete(loop.shutdown_default_executor())
4954
4955 @staticmethod
4956 def _cancel_all_tasks(loop):
4957 to_cancel = asyncio.all_tasks(loop)
4958
4959 if not to_cancel:
4960 return
4961
4962 for task in to_cancel:
4963 task.cancel()
4964
4965 # XXX: "loop" is removed in python 3.10+
4966 loop.run_until_complete(
4967 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
4968
4969 for task in to_cancel:
4970 if task.cancelled():
4971 continue
4972 if task.exception() is not None:
4973 loop.call_exception_handler({
4974 'message': 'unhandled exception during asyncio.run() shutdown',
4975 'exception': task.exception(),
4976 'task': task,
4977 })
4978
4979
4980 def merge_headers(*dicts):
4981 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
4982 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
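# Illustrative: merge_headers({'user-agent': 'curl'}, {'USER-AGENT': 'yt-dlp'})
# == {'User-Agent': 'yt-dlp'}; keys are normalized with str.title(), so the
# right-most dict wins for any spelling of the same header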
4983
4984
4985 def cached_method(f):
4986 """Cache a method"""
4987 signature = inspect.signature(f)
4988
4989 @functools.wraps(f)
4990 def wrapper(self, *args, **kwargs):
4991 bound_args = signature.bind(self, *args, **kwargs)
4992 bound_args.apply_defaults()
4993 key = tuple(bound_args.arguments.values())[1:]
4994
4995 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
4996 if key not in cache:
4997 cache[key] = f(self, *args, **kwargs)
4998 return cache[key]
4999 return wrapper
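# Illustrative usage (hypothetical class); the cache key is the bound arguments
# minus `self`, stored per instance:
#   class Example:
#       @cached_method
#       def square(self, x):
#           return x ** 2  # computed only once per (instance, x)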
5000
5001
5002 class classproperty:
5003 """property access for class methods with optional caching"""
5004 def __new__(cls, func=None, *args, **kwargs):
5005 if not func:
5006 return functools.partial(cls, *args, **kwargs)
5007 return super().__new__(cls)
5008
5009 def __init__(self, func, *, cache=False):
5010 functools.update_wrapper(self, func)
5011 self.func = func
5012 self._cache = {} if cache else None
5013
5014 def __get__(self, _, cls):
5015 if self._cache is None:
5016 return self.func(cls)
5017 elif cls not in self._cache:
5018 self._cache[cls] = self.func(cls)
5019 return self._cache[cls]
5020
5021
5022 class function_with_repr:
5023 def __init__(self, func, repr_=None):
5024 functools.update_wrapper(self, func)
5025 self.func, self.__repr = func, repr_
5026
5027 def __call__(self, *args, **kwargs):
5028 return self.func(*args, **kwargs)
5029
5030 def __repr__(self):
5031 if self.__repr:
5032 return self.__repr
5033 return f'{self.func.__module__}.{self.func.__qualname__}'
5034
5035
5036 class Namespace(types.SimpleNamespace):
5037 """Immutable namespace"""
5038
5039 def __iter__(self):
5040 return iter(self.__dict__.values())
5041
5042 @property
5043 def items_(self):
5044 return self.__dict__.items()
5045
5046
5047 MEDIA_EXTENSIONS = Namespace(
5048 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5049 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5050 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5051 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
5052 thumbnails=('jpg', 'png', 'webp'),
5053 storyboards=('mhtml', ),
5054 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5055 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5056 )
5057 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5058 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5059
5060 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5061
5062
5063 class RetryManager:
5064 """Usage:
5065 for retry in RetryManager(...):
5066 try:
5067 ...
5068 except SomeException as err:
5069 retry.error = err
5070 continue
5071 """
5072 attempt, _error = 0, None
5073
5074 def __init__(self, _retries, _error_callback, **kwargs):
5075 self.retries = _retries or 0
5076 self.error_callback = functools.partial(_error_callback, **kwargs)
5077
5078 def _should_retry(self):
5079 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5080
5081 @property
5082 def error(self):
5083 if self._error is NO_DEFAULT:
5084 return None
5085 return self._error
5086
5087 @error.setter
5088 def error(self, value):
5089 self._error = value
5090
5091 def __iter__(self):
5092 while self._should_retry():
5093 self.error = NO_DEFAULT
5094 self.attempt += 1
5095 yield self
5096 if self.error:
5097 self.error_callback(self.error, self.attempt, self.retries)
5098
5099 @staticmethod
5100 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5101 """Utility function for reporting retries"""
5102 if count > retries:
5103 if error:
5104 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5105 raise e
5106
5107 if not count:
5108 return warn(e)
5109 elif isinstance(e, ExtractorError):
5110 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5111 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5112
5113 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5114 if delay:
5115 info(f'Sleeping {delay:.2f} seconds ...')
5116 time.sleep(delay)
5117
5118
5119 def make_archive_id(ie, video_id):
5120 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5121 return f'{ie_key.lower()} {video_id}'
5122
5123
5124 def truncate_string(s, left, right=0):
5125 assert left > 3 and right >= 0
5126 if s is None or len(s) <= left + right:
5127 return s
5128 return f'{s[:left-3]}...{s[-right:] if right else ""}'
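# Illustrative: truncate_string('abcdefghij', 5, 2) == 'ab...ij' (the ellipsis
# counts towards `left`); strings short enough pass through unchanged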
5129
5130
5131 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5132 assert 'all' in alias_dict, '"all" alias is required'
5133 requested = list(start or [])
5134 for val in options:
5135 discard = val.startswith('-')
5136 if discard:
5137 val = val[1:]
5138
5139 if val in alias_dict:
5140 val = alias_dict[val] if not discard else [
5141 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5142 # NB: Do not allow regex in aliases for performance
5143 requested = orderedSet_from_options(val, alias_dict, start=requested)
5144 continue
5145
5146 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5147 else [val] if val in alias_dict['all'] else None)
5148 if current is None:
5149 raise ValueError(val)
5150
5151 if discard:
5152 for item in current:
5153 while item in requested:
5154 requested.remove(item)
5155 else:
5156 requested.extend(current)
5157
5158 return orderedSet(requested)
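# Illustrative (hypothetical alias_dict):
#   orderedSet_from_options(['all', '-b'], {'all': ['a', 'b', 'c']}) == ['a', 'c']
# where 'all' expands via the alias table and '-b' then discards 'b'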
5159
5160
5161 # TODO: Rewrite
5162 class FormatSorter:
5163 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
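# Illustrative: a sort string like '+res:1080' parses as reverse='+',
# field='res', separator=':' and limit='1080'; with '~' as the separator,
# values closest to the limit are preferred instead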
5164
5165 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5166 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5167 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
5168 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5169 'height', 'width', 'proto', 'vext', 'abr', 'aext',
5170 'fps', 'fs_approx', 'source', 'id')
5171
5172 settings = {
5173 'vcodec': {'type': 'ordered', 'regex': True,
5174 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5175 'acodec': {'type': 'ordered', 'regex': True,
5176 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
5177 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5178 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5179 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5180 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5181 'vext': {'type': 'ordered', 'field': 'video_ext',
5182 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5183 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
5184 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5185 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5186 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
5187 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5188 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5189 'field': ('vcodec', 'acodec'),
5190 'function': lambda it: int(any(v != 'none' for v in it))},
5191 'ie_pref': {'priority': True, 'type': 'extractor'},
5192 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5193 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5194 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5195 'quality': {'convert': 'float', 'default': -1},
5196 'filesize': {'convert': 'bytes'},
5197 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5198 'id': {'convert': 'string', 'field': 'format_id'},
5199 'height': {'convert': 'float_none'},
5200 'width': {'convert': 'float_none'},
5201 'fps': {'convert': 'float_none'},
5202 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5203 'tbr': {'convert': 'float_none'},
5204 'vbr': {'convert': 'float_none'},
5205 'abr': {'convert': 'float_none'},
5206 'asr': {'convert': 'float_none'},
5207 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5208
5209 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5210 'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
5211 'function': lambda it: next(filter(None, it), None)},
5212 'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
5213 'function': lambda it: next(filter(None, it), None)},
5214 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5215 'res': {'type': 'multiple', 'field': ('height', 'width'),
5216 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5217
5218 # Actual field names
5219 'format_id': {'type': 'alias', 'field': 'id'},
5220 'preference': {'type': 'alias', 'field': 'ie_pref'},
5221 'language_preference': {'type': 'alias', 'field': 'lang'},
5222 'source_preference': {'type': 'alias', 'field': 'source'},
5223 'protocol': {'type': 'alias', 'field': 'proto'},
5224 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5225 'audio_channels': {'type': 'alias', 'field': 'channels'},
5226
5227 # Deprecated
5228 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5229 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5230 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5231 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5232 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5233 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5234 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5235 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5236 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5237 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5238 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5239 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5240 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5241 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5242 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5243 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5244 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5245 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5246 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5247 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5248 }
5249
5250 def __init__(self, ydl, field_preference):
5251 self.ydl = ydl
5252 self._order = []
5253 self.evaluate_params(self.ydl.params, field_preference)
5254 if ydl.params.get('verbose'):
5255 self.print_verbose_info(self.ydl.write_debug)
5256
5257 def _get_field_setting(self, field, key):
5258 if field not in self.settings:
5259 if key in ('forced', 'priority'):
5260 return False
5261 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5262 'deprecated and may be removed in a future version')
5263 self.settings[field] = {}
5264 propObj = self.settings[field]
5265 if key not in propObj:
5266 type = propObj.get('type')
5267 if key == 'field':
5268 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5269 elif key == 'convert':
5270 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5271 else:
5272 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5273 propObj[key] = default
5274 return propObj[key]
5275
5276 def _resolve_field_value(self, field, value, convertNone=False):
5277 if value is None:
5278 if not convertNone:
5279 return None
5280 else:
5281 value = value.lower()
5282 conversion = self._get_field_setting(field, 'convert')
5283 if conversion == 'ignore':
5284 return None
5285 if conversion == 'string':
5286 return value
5287 elif conversion == 'float_none':
5288 return float_or_none(value)
5289 elif conversion == 'bytes':
5290 return parse_bytes(value)
5291 elif conversion == 'order':
5292 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5293 use_regex = self._get_field_setting(field, 'regex')
5294 list_length = len(order_list)
5295 empty_pos = order_list.index('') if '' in order_list else list_length + 1
5296 if use_regex and value is not None:
5297 for i, regex in enumerate(order_list):
5298 if regex and re.match(regex, value):
5299 return list_length - i
5300 return list_length - empty_pos # not in list
5301 else: # not regex or value = None
5302 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5303 else:
5304 if value.isnumeric():
5305 return float(value)
5306 else:
5307 self.settings[field]['convert'] = 'string'
5308 return value
5309
5310 def evaluate_params(self, params, sort_extractor):
5311 self._use_free_order = params.get('prefer_free_formats', False)
5312 self._sort_user = params.get('format_sort', [])
5313 self._sort_extractor = sort_extractor
5314
5315 def add_item(field, reverse, closest, limit_text):
5316 field = field.lower()
5317 if field in self._order:
5318 return
5319 self._order.append(field)
5320 limit = self._resolve_field_value(field, limit_text)
5321 data = {
5322 'reverse': reverse,
5323 'closest': False if limit is None else closest,
5324 'limit_text': limit_text,
5325 'limit': limit}
5326 if field in self.settings:
5327 self.settings[field].update(data)
5328 else:
5329 self.settings[field] = data
5330
5331 sort_list = (
5332 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5333 + (tuple() if params.get('format_sort_force', False)
5334 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5335 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5336
5337 for item in sort_list:
5338 match = re.match(self.regex, item)
5339 if match is None:
5340 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5341 field = match.group('field')
5342 if field is None:
5343 continue
5344 if self._get_field_setting(field, 'type') == 'alias':
5345 alias, field = field, self._get_field_setting(field, 'field')
5346 if self._get_field_setting(alias, 'deprecated'):
5347 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5348 f'be removed in a future version. Please use {field} instead')
5349 reverse = match.group('reverse') is not None
5350 closest = match.group('separator') == '~'
5351 limit_text = match.group('limit')
5352
5353 has_limit = limit_text is not None
5354 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5355 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5356
5357 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5358 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5359 limit_count = len(limits)
5360 for (i, f) in enumerate(fields):
5361 add_item(f, reverse, closest,
5362 limits[i] if i < limit_count
5363 else limits[0] if has_limit and not has_multiple_limits
5364 else None)
5365
5366 def print_verbose_info(self, write_debug):
5367 if self._sort_user:
5368 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5369 if self._sort_extractor:
5370 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5371 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5372 '+' if self._get_field_setting(field, 'reverse') else '', field,
5373 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5374 self._get_field_setting(field, 'limit_text'),
5375 self._get_field_setting(field, 'limit'))
5376 if self._get_field_setting(field, 'limit_text') is not None else '')
5377 for field in self._order if self._get_field_setting(field, 'visible')]))
5378
    def _calculate_field_preference_from_value(self, format, field, type, value):
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

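        # Build a comparison key: tuples compare lexicographically, so the
        # first element acts as a coarse bucket (missing values sort lowest,
        # strings above numbers) and the remaining elements rank values
        # within the bucket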
        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
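            # 'multiple' aggregates several format keys through the field's
            # configured 'function' before the value is compared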
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
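        # Fill in fields that can be derived from other fields, then build
        # the per-field sort key tuple for this format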
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #     format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC in FLV is out-of-spec per the original FLV specification
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
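        # tbr is approximately vbr + abr, so any one of the three can be
        # derived when the other two are known (try_call swallows the error
        # when they are not)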
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)


# XXX: Temporary
class _YDLLogger:
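    # Thin adapter exposing a logger-style interface over a YoutubeDL
    # instance; every method is a no-op when no instance is attached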
    def __init__(self, ydl=None):
        self._ydl = ydl

    def debug(self, message):
        if self._ydl:
            self._ydl.write_debug(message)

    def info(self, message):
        if self._ydl:
            self._ydl.to_screen(message)

    def warning(self, message, *, once=False):
        if self._ydl:
            self._ydl.report_warning(message, once)

    def error(self, message, *, is_error=True):
        if self._ydl:
            self._ydl.report_error(message, is_error=is_error)

    def stdout(self, message):
        if self._ydl:
            self._ydl.to_stdout(message)

    def stderr(self, message):
        if self._ydl:
            self._ydl.to_stderr(message)