yt_dlp/utils/_utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import collections.abc
   9 import contextlib
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import hashlib
  15 import hmac
  16 import html.entities
  17 import html.parser
  18 import inspect
  19 import io
  20 import itertools
  21 import json
  22 import locale
  23 import math
  24 import mimetypes
  25 import netrc
  26 import operator
  27 import os
  28 import platform
  29 import random
  30 import re
  31 import shlex
  32 import socket
  33 import ssl
  34 import struct
  35 import subprocess
  36 import sys
  37 import tempfile
  38 import time
  39 import traceback
  40 import types
  41 import unicodedata
  42 import urllib.error
  43 import urllib.parse
  44 import urllib.request
  45 import xml.etree.ElementTree
  46
  47 from . import traversal
  48
  49 from ..compat import functools  # isort: split
  50 from ..compat import (
  51     compat_etree_fromstring,
  52     compat_expanduser,
  53     compat_HTMLParseError,
  54     compat_os_name,
  55     compat_shlex_quote,
  56 )
  57 from ..dependencies import websockets, xattr
  58
# Drop the last dotted component of this module's name so that it reports
# itself under its parent package
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# The type of a compiled regular expression object, obtained by compiling an
# empty pattern since this is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
  63
  64
  65 class NO_DEFAULT:
  66     pass
  67
  68
  69 def IDENTITY(x):
  70     return x
  71
  72
# Full English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code; every list has twelve entries in
# calendar order
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
  88
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Maps a timezone abbreviation to its offset from UTC in whole hours
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
  98
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The two sequences are
# paired positionally by zip(), so they must stay the same length; the
# multi-letter replacements ('AE', 'OE', 'TH', 'ss', ...) are given as
# one-element lists so that chain() yields them as single items.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 103
# strptime-style format strings for date notations whose day/month order is
# unambiguous. The purely numeric day-first and month-first notations live in
# DATE_FORMATS_DAY_FIRST and DATE_FORMATS_MONTH_FIRST instead.
# NOTE(review): the entries contain no comma, so callers presumably strip
# commas from the input before matching - confirm at the call sites.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)
 148
 149 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 150 DATE_FORMATS_DAY_FIRST.extend([
 151     '%d-%m-%Y',
 152     '%d.%m.%Y',
 153     '%d.%m.%y',
 154     '%d/%m/%Y',
 155     '%d/%m/%y',
 156     '%d/%m/%Y %H:%M:%S',
 157     '%d-%m-%Y %H:%M',
 158     '%H:%M %d/%m/%Y',
 159 ])
 160
 161 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 162 DATE_FORMATS_MONTH_FIRST.extend([
 163     '%m-%d-%Y',
 164     '%m.%d.%Y',
 165     '%m/%d/%Y',
 166     '%m/%d/%y',
 167     '%m/%d/%Y %H:%M:%S',
 168 ])
 169
# Matches the tail `}('<payload>',<number>,<number>,'<a|b|c>'.split('|')` of a
# packed script call and captures those four arguments
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element and captures its
# object or array body in the named group `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Unsigned decimal number with an optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
 174
 175
 176 @functools.cache
 177 def preferredencoding():
 178     """Get preferred encoding.
 179
 180     Returns the best encoding scheme for the system, based on
 181     locale.getpreferredencoding() and some further tweaks.
 182     """
 183     try:
 184         pref = locale.getpreferredencoding()
 185         'TEST'.encode(pref)
 186     except Exception:
 187         pref = 'UTF-8'
 188
 189     return pref
 190
 191
 192 def write_json_file(obj, fn):
 193     """ Encode obj as JSON and write it to fn, atomically if possible """
 194
 195     tf = tempfile.NamedTemporaryFile(
 196         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 197         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 198
 199     try:
 200         with tf:
 201             json.dump(obj, tf, ensure_ascii=False)
 202         if sys.platform == 'win32':
 203             # Need to remove existing file on Windows, else os.rename raises
 204             # WindowsError or FileExistsError.
 205             with contextlib.suppress(OSError):
 206                 os.unlink(fn)
 207         with contextlib.suppress(OSError):
 208             mask = os.umask(0)
 209             os.umask(mask)
 210             os.chmod(tf.name, 0o666 & ~mask)
 211         os.rename(tf.name, fn)
 212     except Exception:
 213         with contextlib.suppress(OSError):
 214             os.remove(tf.name)
 215         raise
 216
 217
 218 def find_xpath_attr(node, xpath, key, val=None):
 219     """ Find the xpath xpath[@key=val] """
 220     assert re.match(r'^[a-zA-Z_-]+$', key)
 221     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 222     return node.find(expr)
 223
 224 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 225 # the namespace parameter
 226
 227
 228 def xpath_with_ns(path, ns_map):
 229     components = [c.split(':') for c in path.split('/')]
 230     replaced = []
 231     for c in components:
 232         if len(c) == 1:
 233             replaced.append(c[0])
 234         else:
 235             ns, tag = c
 236             replaced.append('{%s}%s' % (ns_map[ns], tag))
 237     return '/'.join(replaced)
 238
 239
 240 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 241     def _find_xpath(xpath):
 242         return node.find(xpath)
 243
 244     if isinstance(xpath, str):
 245         n = _find_xpath(xpath)
 246     else:
 247         for xp in xpath:
 248             n = _find_xpath(xp)
 249             if n is not None:
 250                 break
 251
 252     if n is None:
 253         if default is not NO_DEFAULT:
 254             return default
 255         elif fatal:
 256             name = xpath if name is None else name
 257             raise ExtractorError('Could not find XML element %s' % name)
 258         else:
 259             return None
 260     return n
 261
 262
 263 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 264     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 265     if n is None or n == default:
 266         return n
 267     if n.text is None:
 268         if default is not NO_DEFAULT:
 269             return default
 270         elif fatal:
 271             name = xpath if name is None else name
 272             raise ExtractorError('Could not find XML element\'s text %s' % name)
 273         else:
 274             return None
 275     return n.text
 276
 277
 278 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 279     n = find_xpath_attr(node, xpath, key)
 280     if n is None:
 281         if default is not NO_DEFAULT:
 282             return default
 283         elif fatal:
 284             name = f'{xpath}[@{key}]' if name is None else name
 285             raise ExtractorError('Could not find XML attribute %s' % name)
 286         else:
 287             return None
 288     return n.attrib[key]
 289
 290
 291 def get_element_by_id(id, html, **kwargs):
 292     """Return the content of the tag with the specified ID in the passed HTML document"""
 293     return get_element_by_attribute('id', id, html, **kwargs)
 294
 295
 296 def get_element_html_by_id(id, html, **kwargs):
 297     """Return the html of the tag with the specified ID in the passed HTML document"""
 298     return get_element_html_by_attribute('id', id, html, **kwargs)
 299
 300
 301 def get_element_by_class(class_name, html):
 302     """Return the content of the first tag with the specified class in the passed HTML document"""
 303     retval = get_elements_by_class(class_name, html)
 304     return retval[0] if retval else None
 305
 306
 307 def get_element_html_by_class(class_name, html):
 308     """Return the html of the first tag with the specified class in the passed HTML document"""
 309     retval = get_elements_html_by_class(class_name, html)
 310     return retval[0] if retval else None
 311
 312
 313 def get_element_by_attribute(attribute, value, html, **kwargs):
 314     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 315     return retval[0] if retval else None
 316
 317
 318 def get_element_html_by_attribute(attribute, value, html, **kargs):
 319     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 320     return retval[0] if retval else None
 321
 322
 323 def get_elements_by_class(class_name, html, **kargs):
 324     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 325     return get_elements_by_attribute(
 326         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 327         html, escape_value=False)
 328
 329
 330 def get_elements_html_by_class(class_name, html):
 331     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 332     return get_elements_html_by_attribute(
 333         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 334         html, escape_value=False)
 335
 336
 337 def get_elements_by_attribute(*args, **kwargs):
 338     """Return the content of the tag with the specified attribute in the passed HTML document"""
 339     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 340
 341
 342 def get_elements_html_by_attribute(*args, **kwargs):
 343     """Return the html of the tag with the specified attribute in the passed HTML document"""
 344     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 345
 346
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Yield the text (content) and the html (whole) of every tag with the
    specified attribute in the passed HTML document

    @param attribute     Attribute name; matched literally
    @param value         Attribute value to look for
    @param html          The HTML document to search
    @param tag           Regex that the tag name has to match
    @param escape_value  Match `value` literally; pass False to have it
                         interpreted as a regex
    This is a generator of (content, whole) tuples; nothing is yielded when
    `value` is empty.
    """
    if not value:
        return

    # Decides whether the quotes around the value are mandatory ('') or optional ('?').
    # NOTE(review): re.match only inspects the first character of `value`, so a
    # value with such a character further in still gets optional quotes - confirm intent
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches from the opening "<tag" up to and including the wanted
    # attribute; any attributes before it (quoted or not) are skipped over
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Extract the complete element, starting at the matched opening tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Drop one pair of quotes enclosing the entire content, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 372
 373
 374 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 375     """
 376     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 377     closing tag for the first opening tag it has encountered, and can be used
 378     as a context manager
 379     """
 380
 381     class HTMLBreakOnClosingTagException(Exception):
 382         pass
 383
 384     def __init__(self):
 385         self.tagstack = collections.deque()
 386         html.parser.HTMLParser.__init__(self)
 387
 388     def __enter__(self):
 389         return self
 390
 391     def __exit__(self, *_):
 392         self.close()
 393
 394     def close(self):
 395         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 396         # so data remains buffered; we no longer have any interest in it, thus
 397         # override this method to discard it
 398         pass
 399
 400     def handle_starttag(self, tag, _):
 401         self.tagstack.append(tag)
 402
 403     def handle_endtag(self, tag):
 404         if not self.tagstack:
 405             raise compat_HTMLParseError('no tags in the stack')
 406         while self.tagstack:
 407             inner_tag = self.tagstack.pop()
 408             if inner_tag == tag:
 409                 break
 410         else:
 411             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 412         if not self.tagstack:
 413             raise self.HTMLBreakOnClosingTagException()
 414
 415
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    @param tag   Tag name; searched for literally as `<tag` and `</tag>`
    @param html  The HTML document to search
    @returns     Tuple (content, whole): the text between the opening and the
                 matching closing tag, and the element including both tags
    Raises compat_HTMLParseError if the opening tag, its terminating '>' or a
    matching closing tag cannot be found.
    """
    def find_or_raise(haystack, needle, exc):
        # str.index, but raising the supplied exception on a miss
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Make the index absolute and move it past the '>'
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed only the opening tag first, to verify that the parser sees it as `tag`
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document one closing-tag candidate at a time; the parser
        # raises once the candidate closes the outermost element, i.e. nested
        # elements with the same tag name are stepped over
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                # The candidate positions are relative to `offset`
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
 450
 451
 452 class HTMLAttributeParser(html.parser.HTMLParser):
 453     """Trivial HTML parser to gather the attributes for a single element"""
 454
 455     def __init__(self):
 456         self.attrs = {}
 457         html.parser.HTMLParser.__init__(self)
 458
 459     def handle_starttag(self, tag, attrs):
 460         self.attrs = dict(attrs)
 461         raise compat_HTMLParseError('done')
 462
 463
 464 class HTMLListAttrsParser(html.parser.HTMLParser):
 465     """HTML parser to gather the attributes for the elements of a list"""
 466
 467     def __init__(self):
 468         html.parser.HTMLParser.__init__(self)
 469         self.items = []
 470         self._level = 0
 471
 472     def handle_starttag(self, tag, attrs):
 473         if tag == 'li' and self._level == 0:
 474             self.items.append(dict(attrs))
 475         self._level += 1
 476
 477     def handle_endtag(self, tag):
 478         self._level -= 1
 479
 480
 481 def extract_attributes(html_element):
 482     """Given a string for an HTML element such as
 483     <el
 484          a="foo" B="bar" c="&98;az" d=boz
 485          empty= noval entity="&amp;"
 486          sq='"' dq="'"
 487     >
 488     Decode and return a dictionary of attributes.
 489     {
 490         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 491         'empty': '', 'noval': None, 'entity': '&',
 492         'sq': '"', 'dq': '\''
 493     }.
 494     """
 495     parser = HTMLAttributeParser()
 496     with contextlib.suppress(compat_HTMLParseError):
 497         parser.feed(html_element)
 498         parser.close()
 499     return parser.attrs
 500
 501
 502 def parse_list(webpage):
 503     """Given a string for an series of HTML <li> elements,
 504     return a dictionary of their attributes"""
 505     parser = HTMLListAttrsParser()
 506     parser.feed(webpage)
 507     parser.close()
 508     return parser.items
 509
 510
 511 def clean_html(html):
 512     """Clean an HTML snippet into a readable string"""
 513
 514     if html is None:  # Convenience for sanitizing descriptions etc.
 515         return html
 516
 517     html = re.sub(r'\s+', ' ', html)
 518     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 519     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 520     # Strip html tags
 521     html = re.sub('<.*?>', '', html)
 522     # Replace html entities
 523     html = unescapeHTML(html)
 524     return html.strip()
 525
 526
 527 class LenientJSONDecoder(json.JSONDecoder):
 528     # TODO: Write tests
 529     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 530         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 531         self._close_attempts = 2 * close_objects
 532         super().__init__(*args, **kwargs)
 533
 534     @staticmethod
 535     def _close_object(err):
 536         doc = err.doc[:err.pos]
 537         # We need to add comma first to get the correct error message
 538         if err.msg.startswith('Expecting \',\''):
 539             return doc + ','
 540         elif not doc.endswith(','):
 541             return
 542
 543         if err.msg.startswith('Expecting property name'):
 544             return doc[:-1] + '}'
 545         elif err.msg.startswith('Expecting value'):
 546             return doc[:-1] + ']'
 547
 548     def decode(self, s):
 549         if self.transform_source:
 550             s = self.transform_source(s)
 551         for attempt in range(self._close_attempts + 1):
 552             try:
 553                 if self.ignore_extra:
 554                     return self.raw_decode(s.lstrip())[0]
 555                 return super().decode(s)
 556             except json.JSONDecodeError as e:
 557                 if e.pos is None:
 558                     raise
 559                 elif attempt < self._close_attempts:
 560                     s = self._close_object(e)
 561                     if s is not None:
 562                         continue
 563                 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
 564         assert False, 'Too many attempts to decode JSON'
 565
 566
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        # Prefer the underlying binary buffer when stdout has one
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # First attempt with the name as given, second one with the sanitized name
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            # NOTE(review): LockingUnsupportedError is defined elsewhere and is
            # presumably an OSError subclass, so it lands here too - confirm
            except OSError:
                # Could not get a locked file; fall back to a plain open()
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up after the second attempt, and on permission errors
            # which renaming cannot fix
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Sanitizing changed nothing, so retrying would fail the same way
            if old_filename == filename:
                raise
 604
 605
 606 def timeconvert(timestr):
 607     """Convert RFC 2822 defined time string into system timestamp"""
 608     timestamp = None
 609     timetuple = email.utils.parsedate_tz(timestr)
 610     if timetuple is not None:
 611         timestamp = email.utils.mktime_tz(timetuple)
 612     return timestamp
 613
 614
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; '' is only returned for empty input
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Map a single character to its replacement. Replacements that were
        # inserted as substitutes are prefixed with NUL ('\0') so that the
        # passes further down can tell them apart from original characters
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Question marks and control characters are dropped altogether
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    # Done before the per-character pass so that the colons of e.g. "12:34:56"
    # become plain underscores rather than the generic colon replacement
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # The NUL markers have served their purpose; never return an empty name
    result = result.replace('\0', '') or '_'

    # Further clean-up, skipped only when is_id is truthy
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        # Replace a leading dash with an underscore
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        # Remove leading dots
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 668
 669
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    @param s      The path to sanitize
    @param force  Sanitize on other platforms too, where the path is
                  otherwise returned unchanged
    @returns      The normalized path in which, in every component except
                  '.' and '..', the characters /<>:"|\\?* as well as a trailing
                  whitespace or dot are replaced with '#'
    """
    # XXX: this handles drive relative paths (c:sth) incorrectly
    if sys.platform == 'win32':
        # `force` is reset so that the leading-separator handling below only
        # applies to forced sanitization on other platforms
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    # Split the path, minus its drive/UNC prefix, into its components
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # The first component is the empty string before the root separator
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    # Put back the prefix that was excluded from sanitization
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    # TODO: Fix behavioral differences <3.12
    # The workaround using `normpath` only superficially passes tests
    # Ref: https://github.com/python/cpython/pull/100351
    return os.path.normpath(os.path.join(*sanitized_path))
 695
 696
 697 def sanitize_url(url, *, scheme='http'):
 698     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 699     # the number of unwanted failures due to missing protocol
 700     if url is None:
 701         return
 702     elif url.startswith('//'):
 703         return f'{scheme}:{url}'
 704     # Fix some common typos seen so far
 705     COMMON_TYPOS = (
 706         # https://github.com/ytdl-org/youtube-dl/issues/15649
 707         (r'^httpss://', r'https://'),
 708         # https://bx1.be/lives/direct-tv/
 709         (r'^rmtp([es]?)://', r'rtmp\1://'),
 710     )
 711     for mistake, fixup in COMMON_TYPOS:
 712         if re.match(mistake, url):
 713             return re.sub(mistake, fixup, url)
 714     return url
 715
 716
 717 def extract_basic_auth(url):
 718     parts = urllib.parse.urlsplit(url)
 719     if parts.username is None:
 720         return url, None
 721     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 722         parts.hostname if parts.port is None
 723         else '%s:%d' % (parts.hostname, parts.port))))
 724     auth_payload = base64.b64encode(
 725         ('%s:%s' % (parts.username, parts.password or '')).encode())
 726     return url, f'Basic {auth_payload.decode()}'
 727
 728
 729 def expand_path(s):
 730     """Expand shell variables and ~"""
 731     return os.path.expandvars(compat_expanduser(s))
 732
 733
 734 def orderedSet(iterable, *, lazy=False):
 735     """Remove all duplicates from the input iterable"""
 736     def _iter():
 737         seen = []  # Do not use set since the items can be unhashable
 738         for x in iterable:
 739             if x not in seen:
 740                 seen.append(x)
 741                 yield x
 742
 743     return _iter() if lazy else list(_iter())
 744
 745
 746 def _htmlentity_transform(entity_with_semicolon):
 747     """Transforms an HTML entity to a character."""
 748     entity = entity_with_semicolon[:-1]
 749
 750     # Known non-numeric HTML entity
 751     if entity in html.entities.name2codepoint:
 752         return chr(html.entities.name2codepoint[entity])
 753
 754     # TODO: HTML5 allows entities without a semicolon.
 755     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 756     if entity_with_semicolon in html.entities.html5:
 757         return html.entities.html5[entity_with_semicolon]
 758
 759     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 760     if mobj is not None:
 761         numstr = mobj.group(1)
 762         if numstr.startswith('x'):
 763             base = 16
 764             numstr = '0%s' % numstr
 765         else:
 766             base = 10
 767         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 768         with contextlib.suppress(ValueError):
 769             return chr(int(numstr, base))
 770
 771     # Unknown entity in name, return its literal representation
 772     return '&%s;' % entity
 773
 774
 775 def unescapeHTML(s):
 776     if s is None:
 777         return None
 778     assert isinstance(s, str)
 779
 780     return re.sub(
 781         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 782
 783
 784 def escapeHTML(text):
 785     return (
 786         text
 787         .replace('&', '&amp;')
 788         .replace('<', '&lt;')
 789         .replace('>', '&gt;')
 790         .replace('"', '&quot;')
 791         .replace("'", '&#39;')
 792     )
 793
 794
 795 class netrc_from_content(netrc.netrc):
 796     def __init__(self, content):
 797         self.hosts, self.macros = {}, {}
 798         with io.StringIO(content) as stream:
 799             self._parse('-', stream, False)
 800
 801
class Popen(subprocess.Popen):
    """subprocess.Popen with a few adjustments

    - the child's environment has PyInstaller's library path changes undone
    - `text=True` defaults to UTF-8 with `errors='replace'`
    - `shell=True` on Windows builds the shell command line itself
    - helpers to kill the process on interruption and to run it to completion
    """
    if sys.platform == 'win32':
        # Startup info passed to every process created on Windows
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573

        Modifies `env` in place; does nothing unless `sys._MEIPASS` exists.
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            # Restore the value saved as `<key>_ORIG`, or remove the variable
            # if no original value was saved
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        """Start the process; the parameters are those of subprocess.Popen"""
        if env is None:
            # Work on a copy so that os.environ itself is not altered
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remembered so that run() knows whether output is str or bytes
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            # Build the shell invocation here and run it as a plain command
            # (shell=False) instead of letting subprocess do it
            if not isinstance(args, str):
                args = ' '.join(compat_shlex_quote(a) for a in args)
            shell = False
            args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        """Return the absolute path of the command interpreter

        Taken from %ComSpec%, falling back to System32\\cmd.exe below %SystemRoot%.
        Raises FileNotFoundError if that does not yield an absolute path.
        """
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process and wait for it to exit if anything is raised"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process and optionally wait for it to exit

        @param timeout  0 to return immediately; any other value is passed to
                        wait() (None waits without a time limit)
        """
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run a process to completion

        The arguments are those of the constructor; `timeout` is passed on
        to communicate().
        @returns  Tuple (stdout, stderr, returncode) where a falsy stdout or
                  stderr (e.g. None) is replaced with an empty str/bytes,
                  depending on whether text mode was requested
        """
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
 872
 873
 874 def encodeArgument(s):
 875     # Legacy code that uses byte strings
 876     # Uncomment the following line after fixing all post processors
 877     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 878     return s if isinstance(s, str) else s.decode('ascii')
 879
 880
 881 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 882
 883
 884 def timetuple_from_msec(msec):
 885     secs, msec = divmod(msec, 1000)
 886     mins, secs = divmod(secs, 60)
 887     hrs, mins = divmod(mins, 60)
 888     return _timetuple(hrs, mins, secs, msec)
 889
 890
 891 def formatSeconds(secs, delim=':', msec=False):
 892     time = timetuple_from_msec(secs * 1000)
 893     if time.hours:
 894         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 895     elif time.minutes:
 896         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 897     else:
 898         ret = '%d' % time.seconds
 899     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 900
 901
 902 def bug_reports_message(before=';'):
 903     from ..update import REPOSITORY
 904
 905     msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
 906            'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')
 907
 908     before = before.rstrip()
 909     if not before or before.endswith(('.', '!', '?')):
 910         msg = msg[0].title() + msg[1:]
 911
 912     return (before + ' ' if before else '') + msg
 913
 914
 915 class YoutubeDLError(Exception):
 916     """Base exception for YoutubeDL errors."""
 917     msg = None
 918
 919     def __init__(self, msg=None):
 920         if msg is not None:
 921             self.msg = msg
 922         elif self.msg is None:
 923             self.msg = type(self).__name__
 924         super().__init__(self.msg)
 925
 926
class ExtractorError(YoutubeDLError):
    """Error during info extraction.

    The final message (`msg` / `args`) is derived from `ie`, `video_id`, `orig_msg`,
    `cause` and `expected`, and is rebuilt whenever one of the attributes is reassigned.
    """

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        cause, video_id and ie, if given, are included in the final message.
        """
        from ..networking.exceptions import network_exceptions
        # Forced to expected if the exception currently being handled has
        # (exactly) one of the types listed in network_exceptions
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        # NB: Every assignment below goes through __setattr__, which does not yet
        # rebuild the message since `msg` is still the class-level None at this point
        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When raised while handling another ExtractorError, keep that error's exc_info instead
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Concatenation of: "[ie] ", "video_id: ", the original message, " (caused by ...)"
        # and, unless the error is expected, the bug report message
        # NOTE(review): presumably format_field gives '' for fields that are None - confirm
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted `traceback` and `cause` (if set) joined by a newline, or None if the result is empty"""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            # The first item of the formatted exception is skipped
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once a message exists, keep `msg` and `args` in sync with the other attributes.
        # 'msg' and 'args' themselves are excluded since they are assigned right here
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
 969
 970
 971 class UnsupportedError(ExtractorError):
 972     def __init__(self, url):
 973         super().__init__(
 974             'Unsupported URL: %s' % url, expected=True)
 975         self.url = url
 976
 977
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match

    Adds no behavior of its own; it only provides a distinct exception type.
    """
    pass
 981
 982
 983 class GeoRestrictedError(ExtractorError):
 984     """Geographic restriction Error exception.
 985
 986     This exception may be thrown when a video is not available from your
 987     geographic location due to geographic restrictions imposed by a website.
 988     """
 989
 990     def __init__(self, msg, countries=None, **kwargs):
 991         kwargs['expected'] = True
 992         super().__init__(msg, **kwargs)
 993         self.countries = countries
 994
 995
 996 class UserNotLive(ExtractorError):
 997     """Error when a channel/user is not live"""
 998
 999     def __init__(self, msg=None, **kwargs):
1000         kwargs['expected'] = True
1001         super().__init__(msg or 'The channel is not currently live', **kwargs)
1002
1003
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """
        @param exc_info  The original exception that caused the trouble
                         (as returned by sys.exc_info()), if any
        """
        self.exc_info = exc_info
        super().__init__(msg)
1016
1017
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Default message, used when the exception is created without one
    msg = 'Entry not found in info'
1025
1026
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """@param filename  If given, it is appended to the default message"""
        message = self.msg if filename is None else f'{self.msg}: {filename}'
        super().__init__(message)
1039
1040
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    It adds no behavior of its own to YoutubeDLError.
    """
1047
1048
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Default message, used when the exception is created without one
    msg = 'The download was cancelled'
1052
1053
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Default message, used when the exception is created without one
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1057
1058
class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    # Default message, used when the exception is created without one
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1062
1063
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Default message, used when the exception is created without one
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1067
1068
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """@param expected  Stored as-is in the `expected` attribute"""
        self.expected = expected
        super().__init__(msg)
1075
1076
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always uses the fixed message above and is never marked as expected
        super().__init__(msg=self.msg, expected=False)
1083
1084
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """@param err  If given, it is appended to the default message"""
        message = self.msg if err is None else f'{self.msg}: {err}'
        super().__init__(message)
1097
1098
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """Both `downloaded` and `expected` are sizes in bytes"""
        message = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(message)
        self.downloaded, self.expected = downloaded, expected
1112
1113
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an errno-style `code` and a message, classified into a coarse `reason`

    `reason` is one of:
        'NO_SPACE'        no space left / disk quota exceeded
        'VALUE_TOO_LONG'  argument list too long
        'NOT_SUPPORTED'   anything else
    """

    def __init__(self, code=None, msg='Unknown error'):
        # The message may be forwarded from a source that has none (e.g. `OSError.strerror`
        # can be None); the substring checks below need a string
        if msg is None:
            msg = 'Unknown error'
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1128
1129
class XAttrUnavailableError(YoutubeDLError):
    # Adds no behavior of its own to YoutubeDLError
    # NOTE(review): going by the name, this signals that xattr support is unavailable;
    # the raise sites are not part of this section - confirm
    pass
1132
1133
def is_path_like(f):
    """Return whether `f` is a `str`, `bytes` or `os.PathLike` object"""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1136
1137
def extract_timezone(date_str):
    """
    Split a trailing timezone designator off a date string

    Recognized are a final "Z", a final "+hh[:]mm" / "-hh[:]mm" offset and,
    failing those, a trailing upper-case abbreviation listed in TIMEZONE_NAMES.

    @returns  (timezone, date_str): the offset as a `datetime.timedelta` (zero if
              nothing was recognized) and `date_str` without the recognized designator
    """
    # NB: Both alternatives of "tz" must be anchored to the end of the string, since
    # the designator is removed below by cutting len(tz) characters off the end
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z$|                                           # just the UTC Z at the end, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):  # Matched "Z"
            return datetime.timedelta(), date_str
        sign = 1 if m.group('sign') == '+' else -1
        timezone = datetime.timedelta(
            hours=sign * int(m.group('hours')),
            minutes=sign * int(m.group('minutes')))
        return timezone, date_str

    # Fall back to a named timezone following a time of day
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
    if timezone is not None:
        date_str = date_str[:-len(m.group('tz'))]
    return datetime.timedelta(hours=timezone or 0), date_str
1166
1167
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """
    Return a UNIX timestamp from the given date

    @param delimiter  Separator between the date and the time part
    @param timezone   UTC offset as a `datetime.timedelta`. If None, it is extracted from date_str
    @returns          The timestamp, or None if date_str is None or cannot be parsed
    """
    if date_str is None:
        return None

    # Fractional seconds are dropped
    date_str = re.sub(r'\.[0-9]+', '', date_str)
    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        local_dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((local_dt - timezone).timetuple())
    except ValueError:
        return None
1183
1184
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST if `day_first` is truthy, else DATE_FORMATS_MONTH_FIRST"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1187
1188
def unified_strdate(date_str, day_first=True):
    """
    Return a string with the date in the format YYYYMMDD

    @param day_first  Which set of formats `date_formats` should provide
    @returns          The date, or None if date_str is None or could not be parsed
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    upload_date = None
    # NB: All formats are tried; the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to parsing as an RFC 2822 style date
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')

    return None if upload_date is None else str(upload_date)
1211
1212
def unified_timestamp(date_str, day_first=True):
    """
    Return a UNIX timestamp from a date string in one of various formats

    @param day_first  Which set of formats `date_formats` should provide
    @returns          The timestamp, or None if date_str is not a string or could not be parsed
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas, pipes and weekday names, then collapse whitespace
    without_weekdays = re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)
    date_str = re.sub(r'\s+', ' ', without_weekdays)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    unknown_tz = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if unknown_tz:
        date_str = date_str[:-len(unknown_tz.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    nanoseconds = re.search(
        r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if nanoseconds:
        date_str = nanoseconds.group(1)

    # The first format that parses wins
    for fmt in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, fmt) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            continue

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
    return None
1244
1245
def determine_ext(url, default_ext='unknown_video'):
    """
    Guess the file extension from a URL

    The guess is whatever follows the last "." before the query string. It is accepted
    if purely alphanumeric, or if (ignoring trailing slashes) it is in KNOWN_EXTENSIONS.

    @returns  The extension, or default_ext if none could be determined
    """
    if url is None or '.' not in url:
        return default_ext

    without_query, _, _ = url.partition('?')
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    return trimmed if trimmed in KNOWN_EXTENSIONS else default_ext
1257
1258
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by "<sub_lang>.<sub_format>" (using replace_extension)"""
    subtitle_ext = '.'.join((sub_lang, sub_format))
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1261
1262
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # No offset given; date_str has to be a plain DATE
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
    unit = match.group('unit')
    if unit in ('month', 'year'):
        # Months and years are not fixed-length, so they are added calendar-wise
        new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(new_date, unit) if auto_precision else new_date
1303
1304
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    @raises ValueError  if strict is set and date_str has a different pattern
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1315
1316
def datetime_add_months(dt, months):
    """
    Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    year_delta, month_index = divmod(dt.month + months - 1, 12)
    year, month = dt.year + year_delta, month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1324
1325
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    @param precision  microsecond|second|minute|hour|day
    @returns          `dt` itself for "microsecond"; otherwise a new UTC datetime,
                      rounded to the nearest unit (halves are rounded up)
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    seconds = calendar.timegm(dt.timetuple())
    step = unit_seconds[precision]
    rounded = ((seconds + step / 2) // step) * step
    return datetime.datetime.fromtimestamp(rounded, datetime.timezone.utc)
1342
1343
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Anything not made up of exactly 8 digits is returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if parts is None else '-'.join(parts.groups())
1352
1353
class DateRange:
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """
        start and end must be strings in the (strict) format accepted by date_from_str.
        A missing start/end leaves the range open on that side.

        @raises ValueError  if start is later than end
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start, strict=True)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date (a date object or a string for date_from_str) is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        start, end = self.start.isoformat(), self.end.isoformat()
        return f'{__name__}.{type(self).__name__}({start!r}, {end!r})'

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
1387
1388
@functools.cache
def system_identifier():
    """Return a (cached) one-line description of the Python, platform, OpenSSL and libc versions in use"""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_ver = []

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1407
1408
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1416
1417
def write_string(s, out=None, encoding=None):
    """
    Write the string `s` to `out` and flush it

    @param out       Stream to write to (default: sys.stderr). Nothing is done if it is falsy
    @param encoding  Encoding to use when bytes have to be written; by default that of
                     the stream (for text streams with a `buffer`) or the preferred one
    """
    assert isinstance(s, str)
    stream = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not stream:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(stream):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(stream, 'mode', ''):
        # Binary stream: encode and write to it directly
        sink, enc = stream, encoding or preferredencoding()
    elif hasattr(stream, 'buffer'):
        # Text stream with an underlying binary buffer: encode and write to the buffer
        sink = stream.buffer
        enc = encoding or getattr(stream, 'encoding', None) or preferredencoding()
    else:
        sink, enc = stream, None

    payload = s.encode(enc, 'ignore') if enc else s
    sink.write(payload)
    stream.flush()
1437
1438
1439 # TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """
    Report a deprecation

    In the CLI, each distinct message is reported only once, with the bug report message
    appended, through `printer` if given and via write_string otherwise.
    Outside the CLI, a DeprecationWarning is issued with the `warnings` module.

    @param stacklevel  Additional stack levels to skip for the warning's location
    @param kwargs      Passed on to printer/write_string
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    already_reported = deprecation_warning._cache
    if msg in already_reported:
        return
    already_reported.add(msg)

    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


deprecation_warning._cache = set()
1455
1456
def bytes_to_intlist(bs):
    """Return the items of `bs` as a list of ints, converting them using `ord` if they are not ints already"""
    if not bs:
        return []
    # Indexing bytes gives ints (Python 3); anything else is treated as characters
    return list(bs) if isinstance(bs[0], int) else list(map(ord, bs))
1464
1465
def intlist_to_bytes(xs):
    """Pack a sequence of ints (0-255 each) into `bytes`"""
    if xs:
        return struct.pack(f'{len(xs)}B', *xs)
    return b''
1470
1471
class LockingUnsupportedError(OSError):
    """OSError with a fixed message stating that file locking is not supported"""
    msg = 'File locking is not supported'

    def __init__(self):
        message = self.msg
        super().__init__(message)
1477
1478
# Cross-platform file locking
# Defines `_lock_file(f, exclusive, block)` and `_unlock_file(f)` for the current platform:
# LockFileEx/UnlockFileEx on Windows, `fcntl` elsewhere. If `fcntl` cannot be imported,
# both functions raise LockingUnsupportedError
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Size (low/high part) of the byte range that is locked and unlocked
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the file object `f`; raises BlockingIOError if LockFileEx fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object since the same structure is passed when unlocking
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Unlock the file object `f` previously locked by `_lock_file`; raises OSError if UnlockFileEx fails"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock the file object `f`; raises BlockingIOError if non-blocking and already locked"""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Unlock the file object `f`, trying each known way of unlocking in turn"""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
1565
1566
class locked_file:
    """
    File object wrapper that holds a lock on the file while it is in use

    The file is opened on creation, but it is locked (and, in "w" modes, truncated)
    only on entering the context / calling `open`. Readers ("r" modes) take a shared
    lock, all others an exclusive one. Unknown attributes are looked up on the
    underlying file object, which is available as `f`.

    @param mode      One of r, rb, a, ab, w, wb
    @param block     Whether to wait for the lock rather than fail if it is held already
    @param encoding  Passed on when creating the file object
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        fd = os.open(filename, flags, 0o666)
        try:
            self.f = os.fdopen(fd, mode, encoding=encoding)
        except BaseException:
            # Without a file object nothing else will ever close the descriptor.
            # It may have been closed already during the failed attempt, hence the suppress
            with contextlib.suppress(OSError):
                os.close(fd)
            raise

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncating is deferred until the lock is held
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; the file stays open"""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1630
1631
@functools.cache
def get_filesystem_encoding():
    """Return (and cache) `sys.getfilesystemencoding()`, falling back to 'utf-8' if that is None"""
    detected = sys.getfilesystemencoding()
    if detected is None:
        return 'utf-8'
    return detected
1636
1637
def shell_quote(args):
    """Quote each of `args` using compat_shlex_quote and join the results with spaces"""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(a.decode(encoding) if isinstance(a, bytes) else a)
        for a in args)
1647
1648
def smuggle_url(url, data):
    """
    Pass additional data in a URL for internal use.

    The data is JSON-encoded into the fragment of the URL. If the URL carries smuggled
    data already, both are combined, with the data already present taking precedence.

    @param data  dict of JSON-serializable data. It is not modified
    @returns     The URL with the data attached; see unsmuggle_url for the reverse
    """
    url, idata = unsmuggle_url(url, {})
    # Merge into a new dict so that the caller's dict is left untouched
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps({**data, **idata})})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    """
    Split a URL created by smuggle_url into the original URL and the smuggled data

    @returns  (url, data); (smug_url, default) if the URL carries no smuggled data
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
1666
1667
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """
    Formats numbers with decimal suffixes like K, M, etc

    @param fmt     %-format applied to (scaled number, suffix)
    @param factor  Ratio between consecutive suffixes; with 1024, the suffixes are Ki, Mi, etc
    @returns       The formatted string, or None if num is negative or float_or_none gives None for it
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # The logarithm of numbers below 1 is negative. It has to be clamped to 0, since a
        # negative exponent would select the suffix from the end of the list (0.0001 -> "Y")
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
1680
1681
def format_bytes(bytes):
    """Format a byte count with binary (1024-based) suffixes; 'N/A' if it cannot be formatted"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted if formatted else 'N/A'
1684
1685
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number><unit>' from s and scale the number by the unit's multiplier

    @param unit_table  Mapping of unit string -> multiplier
    @param s           The string to parse
    @param strict      If set, the whole string must match and only '.' is
                       accepted as the decimal separator. Otherwise matching
                       starts at the beginning of s and ',' is accepted too
    @returns           The rounded product, or None if s does not match
    """
    if strict:
        number_re, matcher = NUMBER_RE, re.fullmatch
    else:
        number_re, matcher = NUMBER_RE.replace(R'\.', '[,.]'), re.match
    unit_re = '|'.join(map(re.escape, unit_table))

    found = matcher(rf'(?P<num>{number_re})\s*(?P<unit>{unit_re})\b', s)
    if found is None:
        return None

    value = float(found.group('num').replace(',', '.'))
    return round(value * unit_table[found.group('unit')])
1697
1698
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer

    The number may be followed by one of the letters K, M, G, T, P, E, Z, Y
    (case-insensitive), each standing for a successive power of 1024.
    """
    suffixes = ['', *'KMGTPEZY']
    multipliers = {suffix: 1024**power for power, suffix in enumerate(suffixes)}
    return lookup_unit_table(multipliers, s.upper(), strict=True)
1704
1705
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' or '200kB' into bytes

    Returns None if s is None or cannot be parsed.
    """
    if s is None:
        return None

    # (symbol, decimal name, binary name) for each successive power
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    for power, (symbol, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = symbol.lower()
        unit_table.update({
            f'{symbol}iB': binary,
            f'{symbol}B': decimal,
            f'{lower}B': binary,
            f'{symbol}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
1775
1776
def parse_count(s):
    """Parse a count such as '1,234', '1.5M' or 'Views 10k' into an integer

    Returns None if s is None or no number can be extracted.
    """
    if s is None:
        return None

    # Drop a leading non-numeric label that is followed by whitespace
    text = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    # Each suffix is accepted in lower and upper case
    unit_table = {
        unit: multiplier
        for suffix, multiplier in (
            ('k', 1000),
            ('m', 1000 ** 2),
            ('kk', 1000 ** 2),
            ('b', 1000 ** 3),
        )
        for unit in (suffix, suffix.upper())
    }

    scaled = lookup_unit_table(unit_table, text)
    if scaled is not None:
        return scaled

    leading_number = re.match(r'([\d,.]+)(?:$|\s)', text)
    if leading_number:
        return str_to_int(leading_number.group(1))
    return None
1804
1805
def parse_resolution(s, *, lenient=False):
    """Extract 'width' and/or 'height' from a string describing a resolution

    Recognizes '<w>x<h>' (also with 'X', '×' or ','), '<h>p' / '<h>i' and
    '4k' / '8k'. Unless lenient is set, a '<w>x<h>' pair must not be directly
    adjacent to other letters or digits.

    @returns  A dict with the keys that could be determined; {} if none
    """
    if s is None:
        return {}

    if lenient:
        dimensions_re = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    else:
        dimensions_re = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    dimensions = re.search(dimensions_re, s)
    if dimensions:
        width, height = dimensions.group('w', 'h')
        return {
            'width': int(width),
            'height': int(height),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    k_label = re.search(r'\b([48])[kK]\b', s)
    if k_label:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(k_label.group(1)) * 540}

    return {}
1829
1830
def parse_bitrate(s):
    """Return the integer that precedes 'kbps' in s; None if s is not a string or has no such number"""
    if not isinstance(s, str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
1837
1838
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of the month with the given full name

    The name is looked up in the month list of the given language; the English
    list is used when the language is unknown. Returns None if the name is
    not in the list.
    """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name in names:
        return names.index(name) + 1
    return None
1848
1849
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of the month whose English name starts with
        the given three-letter abbreviation, or None if there is no such month """

    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
1858
1859
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' by '&amp;' in XML

    An '&' that already starts one of the five predefined entities
    (amp, lt, gt, apos, quot) or a numeric character reference
    ('&#...;' / '&#x...;') is left untouched.
    """
    return re.sub(
        # Numeric references may have any number of digits: code points above
        # U+FFFF need five or more (e.g. '&#x1F600;', '&#128512;')
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
        '&amp;',
        xml_str)
1866
1867
def setproctitle(title):
    """Set the name of the current process via prctl() of 'libc.so.6'

    This is best-effort: the function silently does nothing if ctypes or
    the library cannot be loaded, or if the library has no prctl.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # Some LoadLibrary implementations reject the given string type
        return
    # Initializing the buffer from the bytes object makes it one item larger
    # than the title, so it is always NUL-terminated. A buffer sized exactly
    # len(title) has no terminator and prctl would read past its end
    buf = ctypes.create_string_buffer(title.encode())
    try:
        # PR_SET_NAME = 15
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1893
1894
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it is None or lacks the prefix"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1897
1898
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged if it is None or lacks the suffix"""
    if s is None or not s.endswith(end):
        return s
    # Slice with a non-negative stop: `s[:-len(end)]` would be `s[:0]`
    # (i.e. empty) when end is empty
    return s[:len(s) - len(end)]
1901
1902
def remove_quotes(s):
    """Strip one pair of matching single or double quotes that encloses s, if there is one"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1910
1911
def get_domain(url):
    """
    Return the network location of the URL without a leading "www.",
    or None if that is empty.

    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    domain = remove_start(netloc, 'www.')
    return domain if domain else None
1918
1919
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and trailing slashes"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1923
1924
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any query or fragment"""
    matched = re.match(r'https?://[^?#]+/', url)
    return matched.group()
1927
1928
def urljoin(base, path):
    """Join path onto base, tolerating bytes and invalid input

    @param base  Base URL (str or bytes); must start with 'http://',
                 'https://' or '//' unless path is already absolute
    @param path  URL or path to join (str or bytes)
    @returns     path itself if it already has a '//' authority part (with
                 or without a scheme), the joined URL otherwise, or None
                 if path is empty/not a string or base is unusable
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    # NB: '-' must be last in the character class; written as '+-.' it forms
    # a range that also admits ',' as a scheme character
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
1942
1943
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, multiplied by invscale and floor-divided by scale

    If get_attr is given and v is not None, that attribute of v is converted
    instead (None if the attribute is missing). Returns default if the
    conversion fails.
    """
    value = v
    if v is not None and get_attr:
        value = getattr(v, get_attr, None)
    try:
        scaled_up = int(value) * invscale
        return scaled_up // scale
    except (ValueError, TypeError, OverflowError):
        return default
1951
1952
def str_or_none(v, default=None):
    """Return str(v), or default if v is None"""
    if v is None:
        return default
    return str(v)
1955
1956
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Ints are returned as-is. For strings, ',', '.' and '+' are removed before
    the conversion. Anything else gives None.
    """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
1964
1965
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, multiplied by invscale and divided by scale

    Returns default if v is None or cannot be converted.
    """
    try:
        return default if v is None else float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
1973
1974
def bool_or_none(v, default=None):
    """Return v if it is a bool, else default"""
    if isinstance(v, bool):
        return v
    return default
1977
1978
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace if it is a str, else default"""
    if isinstance(v, str):
        return v.strip()
    return default
1981
1982
def url_or_none(url):
    """Return the stripped URL if it looks like a supported URL, else None

    Accepted are scheme-relative URLs ('//...') and URLs using the http(s),
    rtmp/rtmfp/rtsp variants, mms and ftp(s) schemes.
    """
    if not url or not isinstance(url, str):
        return None
    candidate = url.strip()
    supported = re.match(
        r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate)
    if supported:
        return candidate
    return None
1988
1989
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a timestamp or date string with the given strftime format

    @param timestamp    A unix timestamp (int/float, interpreted as UTC) or
                        a date string in YYYYMMDD form
    @param date_format  strftime format; '%s' is additionally supported on
                        all platforms
    @param default      Returned when timestamp is of another type, cannot
                        be parsed or is out of the representable range
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                               + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # If timestamp had an unsupported type, datetime_object is still None
        # here and the AttributeError below results in the default
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError):
        # OverflowError: the timedelta or the resulting date is out of range,
        # e.g. for infinite values or milliseconds passed as seconds
        return default
2007
2008
2009 def parse_duration(s):
2010     if not isinstance(s, str):
2011         return None
2012     s = s.strip()
2013     if not s:
2014         return None
2015
2016     days, hours, mins, secs, ms = [None] * 5
2017     m = re.match(r'''(?x)
2018             (?P<before_secs>
2019                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2020             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2021             (?P<ms>[.:][0-9]+)?Z?$
2022         ''', s)
2023     if m:
2024         days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2025     else:
2026         m = re.match(
2027             r'''(?ix)(?:P?
2028                 (?:
2029                     [0-9]+\s*y(?:ears?)?,?\s*
2030                 )?
2031                 (?:
2032                     [0-9]+\s*m(?:onths?)?,?\s*
2033                 )?
2034                 (?:
2035                     [0-9]+\s*w(?:eeks?)?,?\s*
2036                 )?
2037                 (?:
2038                     (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2039                 )?
2040                 T)?
2041                 (?:
2042                     (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
2043                 )?
2044                 (?:
2045                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2046                 )?
2047                 (?:
2048                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2049                 )?Z?$''', s)
2050         if m:
2051             days, hours, mins, secs, ms = m.groups()
2052         else:
2053             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2054             if m:
2055                 hours, mins = m.groups()
2056             else:
2057                 return None
2058
2059     if ms:
2060         ms = ms.replace(':', '.')
2061     return sum(float(part or 0) * mult for part, mult in (
2062         (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2063
2064
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename ('a.mp4' -> 'a.ext.mp4')

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
2071
2072
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext ('a.mp4' -> 'a.ext')

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    return f'{stem}.{ext}'
2078
2079
def check_executable(exe, args=[]):
    """ Try to run the given binary and return its name if that works, False otherwise.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
2088
2089
def _get_exe_version_output(exe, args):
    """Run the executable with args and return its combined stdout/stderr text

    Returns None if it exits with a non-zero status and False if it
    cannot be run at all.
    """
    command = [encodeArgument(exe)] + args
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        output, _, returncode = Popen.run(
            command, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        return None if returncode else output
    except OSError:
        return False
2102
2103
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract the version from an executable's output

    @param output        The text printed by the executable
    @param version_re    Regex whose first group is the version; defaults
                         to matching 'version <something>'
    @param unrecognized  Returned when the regex does not match
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2113
2114
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    unrecognized is one or two fallback values: the first is returned when
    the version cannot be found in the output, the last when the executable
    exits with a non-zero status """
    fallbacks = variadic(unrecognized)
    assert len(fallbacks) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return fallbacks[-1]
    if not output:
        # Falsy output (e.g. False when the executable could not be run) is passed through
        return output
    return detect_exe_version(output, version_re, fallbacks[0])
2125
2126
def frange(start=0, stop=None, step=1):
    """Float range

    Like range(), but start, stop and step may be floats. Called with a
    single argument, that argument is the stop value. A step of 0 yields nothing.
    """
    if stop is None:
        start, stop = 0, start
    if not step:
        direction = 0
    elif step > 0:
        direction = 1
    else:
        direction = -1
    current = start
    # Multiplying by the direction turns the comparison around for negative steps
    while direction * current < direction * stop:
        yield current
        current += step
2135
2136
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the iterable only when needed and are kept in an
    internal cache. Operations that need the end of the sequence (len(),
    negative indices, reversed iteration, repr) consume the whole iterable."""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # iter() of an iterator returns the iterator itself, so instances
        # created by __reversed__/__copy__ share the source with the original
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # First replay what has already been evaluated...
        yield from self._cache
        # ...then continue with the source, caching every new item
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        """Evaluate the rest of the iterable into the cache and return the cache (in source order)"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable

        Returns a new list with all items, in reverse order if the LazyList is reversed"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # ~x == -x - 1 maps index x counted from one end to the same position
        # counted from the other end (0 <-> -1, 1 <-> -2, ...)
        return None if x is None else ~x

    def __getitem__(self, idx):
        """Return the item (int index) or list of items (slice) at idx

        Raises TypeError for other index types and LazyList.IndexError if an int index is out of range"""
        if isinstance(idx, slice):
            if self._reversed:
                # Translate the slice to the underlying (non-reversed) order
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            # step = 0 marks a plain index for the checks below
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of additional items needed so that the largest requested index is cached
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Fetch a single item instead of evaluating everything through __len__.
        # For a reversed list, index -1 is the first item of the source
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # The length is only known once the whole iterable has been evaluated
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # The new instance is given the same source and the same cache list
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        # The new instance is given the same source and the same cache list
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2224
2225
class PagedList:
    """Base class for lists whose items are fetched page by page

    pagefunc(pagenum) must return an iterable with the items of that page.
    Subclasses implement _getslice(start, end).
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the items of the given page as a list, using the cache if enabled"""
        cached = self._cache.get(pagenum)
        if cached is not None:
            results = cached
        elif pagenum > self._pagecount:
            # Pages beyond the known page count are empty
            results = []
        else:
            results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        """Return the items with indices in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        found = self.getslice(idx, idx + 1)
        if found:
            return found[0]
        raise self.IndexError()
2264
2265
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            page_first = pagenum * self._pagesize
            page_limit = page_first + self._pagesize
            if start >= page_limit:
                continue

            # Offset of the first wanted item; non-zero only on the page containing `start`
            if page_first <= start < page_limit:
                offset = start % self._pagesize
            else:
                offset = 0
            # Cut-off within the page; set only on the page containing `end`
            if end is not None and page_first <= end <= page_limit:
                cutoff = (end - 1) % self._pagesize + 1
            else:
                cutoff = None

            try:
                page = self.getpage(pagenum)
            except Exception:
                # Remember the last page before the failing one, then propagate the error
                self._pagecount = pagenum - 1
                raise
            if offset != 0 or cutoff is not None:
                page = page[offset:cutoff]
            yield from page

            # Stop when
            # - the current page is not "full", i.e. does not contain
            #   page_size videos: we can assume that it is the last one and
            #   that there is no need to query again, or
            # - we got the whole page, but the next page is not interesting
            if len(page) + offset < self._pagesize or end == page_limit:
                break
2305
2306
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        super().__init__(pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        if end is None:
            last_page, remaining = self._pagecount, None
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start
        # Number of items to drop from the beginning of the first page
        to_skip = start - first_page * self._pagesize

        for pagenum in range(first_page, last_page):
            page = self.getpage(pagenum)
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested range
                    yield from page[:remaining]
                    return
                remaining -= len(page)
            yield from page
2331
2332
class PlaylistEntries:
    """Access to the 'entries' of a playlist info dict by playlist index

    Indexing uses 1-based playlist indices and yields (index, entry) pairs."""

    # Placeholder for positions of an incomplete playlist that have no entry
    MissingEntry = object()
    # Set once all entries are known: the entries are a list, or
    # the end of the entries was hit while indexing
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        Object providing params, report_warning, _match_entry
                          and _handle_extraction_exceptions
        @param info_dict  The playlist's info dict; must have 'entries'

        Raises EntryNotInPlaylist if the info dict has no entries
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Put each entry at the position given by its (1-based) requested
            # index; all other positions hold MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            # Any other iterable is wrapped so that it can be indexed lazily
            self._entries = LazyList(entries)

    # One segment of a playlist items specification:
    # either a single index, or a range '[start]:[end][:step]' / '[start]-[end][:step]'
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or a slice for every comma-separated segment of string

        Raises ValueError for empty or malformed segments and for a step of zero
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # The end goes through float_or_none since it may be 'inf'/'infinite'
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (index, entry) for the entries selected by the ydl params
        'playlist_items', 'playliststart' and 'playlistend'"""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    # Stop yielding any further entries
                    return

    def get_full_count(self):
        """Return the total number of entries where it can be determined; None otherwise"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page, the page count is the entry count
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function returning the entry at a 0-based index

        It raises self.IndexError when the index is beyond the end of the entries
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # The lookup is wrapped with ydl's _handle_extraction_exceptions
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Yield (1-based index, entry) for an int or slice of 1-based playlist indices

        Unlike regular slices, the stop index is inclusive.
        Negative indices count from the end, which requires len(self)
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move the stop one further in the direction of iteration to make it inclusive
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    # Iterating forwards, everything further is out of range too
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts by going through all the entries
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2467
2468
def uppercase_escape(s):
    r"""Replace every '\UXXXXXXXX' escape sequence in s by the character it denotes"""
    decode_escape = codecs.getdecoder('unicode_escape')

    def _expand(match):
        decoded, _ = decode_escape(match.group(0))
        return decoded

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
2475
2476
def lowercase_escape(s):
    r"""Replace every '\uXXXX' escape sequence in s by the character it denotes"""
    decode_escape = codecs.getdecoder('unicode_escape')

    def _expand(match):
        decoded, _ = decode_escape(match.group(0))
        return decoded

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
2483
2484
def parse_qs(url, **kwargs):
    """Parse the query string of a URL; kwargs are passed on to urllib.parse.parse_qs"""
    query_string = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query_string, **kwargs)
2487
2488
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object and close it

    Each line (str or bytes) is stripped of a leading BOM and of surrounding
    whitespace. Empty lines and lines starting with '#', ';' or ']' are
    skipped, and a trailing comment introduced by whitespace + '#' is removed.

    @param batch_fd  Iterable of lines that has a close() method
    @returns         The list of URLs
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # The BOM, both as wrongly decoded single bytes and as the proper character
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit must be given by keyword; passing it positionally is
        # deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2506
2507
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data (see urllib.parse.urlencode) and return it as ASCII bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2510
2511
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  update query
       @returns             str

       The entries of query_update are merged into the existing query and
       replace existing parameters of the same name. It cannot be combined
       with the query keyword argument.
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            # Nothing to change
            return url
        parsed = urllib.parse.urlparse(url)
    else:
        parsed = url

    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged_query = {**urllib.parse.parse_qs(parsed.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged_query, True)
    return urllib.parse.urlunparse(parsed._replace(**kwargs))
2530
2531
def update_url_query(url, query):
    """Return the URL with the given parameters merged into its query (see update_url)"""
    updated = update_url(url, query_update=query)
    return updated
2534
2535
def _multipart_encode_impl(data, boundary):
    """Encode data as multipart/form-data using the given boundary

    @param data      Mapping of field name -> value; each str or bytes-like
    @param boundary  The boundary as an ASCII-only str
    @returns         Tuple of (encoded body as bytes, Content-Type header value)

    Raises ValueError if the boundary occurs in a field name or value
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary

    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes
    chunks = []
    for name, value in data.items():
        if isinstance(name, str):
            name = name.encode()
        if isinstance(value, str):
            value = value.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        part = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if boundary_bytes in part:
            raise ValueError('Boundary overlaps with data')
        chunks.append(delimiter + b'\r\n')
        chunks.append(part)

    chunks.append(delimiter + b'--\r\n')

    return b''.join(chunks), content_type
2556
2557
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a (body bytes, Content-Type header value) tuple. A ValueError is
    raised if a caller-supplied boundary occurs inside the encoded data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-supplied boundary is never replaced: a clash propagates
        out, content_type = _multipart_encode_impl(data, boundary)
        return out, content_type

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            out, content_type = _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary clashed with the data; draw another one
            continue
        return out, content_type
2586
2587
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Tell whether x is an instance of allowed_types but not of blocked_types.

    When blocked_types is not given, str, bytes and mappings are blocked.
    """
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
2592
2593
def variadic(x, allowed_types=NO_DEFAULT):
    """Return x unchanged if is_iterable_like accepts it, else wrap it in a 1-tuple.

    allowed_types is handed to is_iterable_like as blocked_types, i.e. values
    of these types are wrapped even when they are iterable.
    """
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
2599
2600
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each of funcs with args/kwargs and return the first acceptable result.

    A call that raises AttributeError, KeyError, TypeError, IndexError,
    ValueError or ZeroDivisionError is skipped. A result is acceptable when
    expected_type is None or the result is an instance of it. Returns None
    when no function yields an acceptable result.
    """
    ignored_errors = (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError)
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except ignored_errors:
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
2610
2611
def try_get(src, getter, expected_type=None):
    """Apply getter (a callable, or several of them) to src through try_call.

    Returns the first result accepted by expected_type, or None.
    """
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
2614
2615
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of dct for which cndn(key, value) is truthy.

    By default, items whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
2618
2619
def merge_dicts(*dicts):
    """Merge dicts, giving precedence to the earlier ones.

    A key takes its value from the first dict where that value is not None.
    The only allowed overwrite is replacing an already merged empty string
    with a str value from a later dict.
    """
    merged = {}
    for source in dicts:
        for key, value in source.items():
            if key not in merged:
                if value is not None:
                    merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
2628
2629
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as a str, decoding it with str(string, encoding, errors) if needed.

    The default encoding is whatever preferredencoding() returned when this
    function was defined.
    """
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
2632
2633
# Age limit associated with each US content rating label.
# parse_age_limit looks its upper-cased input up in this table
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limit associated with each "TV-*" parental guideline label.
# parse_age_limit builds its regex from the part of each key after "TV-"
# and accepts "-", "_" or no separator after "TV"
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2651
2652
def parse_age_limit(s):
    """Convert an age limit given as an int, "NN"/"NN+" or a rating label to an int.

    Ints are accepted only within 0..21. Strings are matched against a
    one/two digit number, then US_RATINGS, then TV_PARENTAL_GUIDELINES.
    Returns None for anything else.
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_match = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_match:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_match.group(1)]
    return None
2669
2670
def strip_jsonp(code):
    """Strip a JSONP wrapper such as `callback(...);` and return the wrapped data.

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_wrapper = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_wrapper.sub(r'\g<callback_data>', code)
2679
2680
def js_to_json(code, vars={}, *, strict=False):
    """Rewrite JavaScript literal code into a JSON string using regex substitutions

    @param code    str with the JavaScript source
    @param vars    dict of identifier -> replacement text. When not strict, a
                   replacement that is not valid JSON is dumped as a JSON string
    @param strict  If True, skip the lenient rewrites (new Date(...), new X(...),
                   parseInt(...), immediately invoked functions) and raise
                   ValueError on unknown identifiers instead of quoting them
    @returns       str
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) pairs for hexadecimal and octal literals, optionally followed by ":"
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # group(1) is a bare double quote, group(2) the character after a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Escapes in the passthrough set are kept (a bare quote becomes \"),
        # "\x" is rewritten to "\u00", an escaped newline is dropped and
        # any other escaped character loses its backslash
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Convert the expression inside ${...} of a template literal;
        # a result that is a JSON string is unquoted before being inlined
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Replacement callback for each token matched by the final re.sub below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, "!" runs and the matched (trailing) commas are removed
            return ''

        if v[0] in STRING_QUOTES:
            # Re-quote any string with double quotes; backtick strings get
            # their ${...} placeholders substituted first
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # An integer used as an object key has to be quoted
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            # Unknown identifiers (this includes unquoted object keys) become strings
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # new Map([[k, v], ...]) -> JSON object; new Map() -> {}
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    # Array(...) and new Array(...) -> [...]
    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # new Date("...") -> "..."
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        # Any other constructor call is kept verbatim as a JSON string
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        # parseInt(<non-digits>123<non-digits>) -> 123
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        # (function(...){...})("string") -> "string"
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    # Tokens handed to fix_kv, in order of the alternatives: strings, comments,
    # commas directly before "]" or "}", "void 0", identifiers, hex/octal
    # integers (optionally as a key), decimal integers used as a key, "!" runs
    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2760
2761
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 if it is not listed. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
2770
2771
# Names of the stages at which a postprocessor may be run
# NOTE(review): meaning inferred from the names; the consumers are outside this section
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Built-in output templates, keyed by output type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output types besides 'default'. Each value is either None or a string
# NOTE(review): the strings look like the file name suffix used for that type - confirm with the users of this table
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# This is a template for a verbose regex: {0} stands for the pattern of the
# mapping key and {1} for the pattern of the conversion type.
# The "prefix" group takes the pairs of "%%" in front of the format
# specifier so that the "%" that follows is known to be unescaped
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of the format syntax described above
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2811
2812
def limit_length(s, length):
    """ Add ellipses to overly long strings

    @param s       The string to shorten; None is passed through
    @param length  Maximum length of the returned string
    @returns       s itself if it fits into length, else a truncated copy
                   ending in "...". If length is too small to fit the whole
                   ellipsis, only the part of the ellipsis that fits is returned
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # A negative slice bound would count from the end of s and
        # return a string longer than length
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
2821
2822
def version_tuple(v):
    """Split a version string at "." and "-" and return its parts as a tuple of ints.

    Raises ValueError if a part is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2825
2826
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is lower than limit.

    If version is empty or either version cannot be parsed by
    version_tuple, the answer is `not assume_new`.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
2834
2835
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported at call time rather than at module import
    from ..update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
2842
2843
def args_to_str(args):
    """Join args into one string, quoting each with compat_shlex_quote."""
    # Get a short string representation for a subprocess command
    quoted_args = map(compat_shlex_quote, args)
    return ' '.join(quoted_args)
2847
2848
def error_to_str(err):
    """Format err as "<name of its type>: <its message>"."""
    type_name = type(err).__name__
    return f'{type_name}: {err}'
2851
2852
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (parameters after ";" are ignored) to a file extension.

    The full type, the subtype and the part of the subtype after the last "+"
    are looked up in this order. Without a match, default is returned if
    given, else the subtype with "+" replaced by ".".
    A non-str mt gives default if given, else None.
    """
    if not isinstance(mt, str):
        return None if default is NO_DEFAULT else default

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'video/ogg': 'ogv',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    full_type = mt.partition(';')[0].strip().lower()
    subtype = full_type.rpartition('/')[2]
    suffix = subtype.rsplit('+')[-1]

    known_ext = traversal.traverse_obj(MAP, full_type, subtype, suffix)
    if known_ext:
        return known_ext
    if default is NO_DEFAULT:
        return subtype.replace('+', '.')
    return default
2942
2943
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension, a file name or a URL.

    Returns None for empty input or when mimetypes cannot guess the type.
    """
    if not ext_or_url:
        return None
    # Anything without a dot is taken to be a bare extension
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    guessed = mimetypes.guess_type(filename)
    return guessed[0]
2950
2951
def parse_codecs(codecs_str):
    """Split a comma separated codecs string into video, audio and subtitle codecs.

    Returns a dict with 'vcodec', 'acodec' ('none' if absent), 'dynamic_range'
    and, if a subtitle codec is present, 'scodec'. If no codec is recognized
    but exactly two were given, they are taken as vcodec and acodec. An empty
    dict is returned otherwise. Unknown codecs produce a warning.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros directly in front of a digit are dropped before comparing
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                # Only the first video codec is used
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if vcodec or acodec or scodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if scodec is not None:
            result['scodec'] = scodec
        return result
    if len(split_codecs) == 2:
        return dict(zip(('vcodec', 'acodec'), split_codecs))
    return {}
2992
2993
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension for the given codecs and source extensions.

    @param vcodecs, vexts   codecs and extensions of the video streams (same length)
    @param acodecs, aexts   codecs and extensions of the audio streams (same length)
    @param preferences      extensions to consider, in order; mkv is allowed
                            if this is empty or contains 'mkv'
    @returns                the chosen extension
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Reduce the first codec of a list to the part before ".", without zeros, lowercased
    sanitize_codec = functools.partial(
        try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
    vcodec = sanitize_codec(vcodecs)
    acodec = sanitize_codec(acodecs)

    # First try to decide by codec
    for candidate in preferences or COMPATIBLE_CODECS:
        if candidate == 'mkv':
            return candidate
        supported_codecs = COMPATIBLE_CODECS.get(candidate, set())
        if supported_codecs.issuperset((vcodec, acodec)):
            return candidate

    # Otherwise decide by the extensions of the sources
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for candidate in preferences or vexts:
        if candidate == 'mkv':
            return candidate
        involved_exts = {candidate, *vexts, *aexts}
        if involved_exts == {candidate}:
            return candidate
        if any(family.issuperset(involved_exts) for family in COMPATIBLE_EXTS):
            return candidate

    if allow_mkv:
        return 'mkv'
    return preferences[-1]
3033
3034
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Detect the file extension from the headers of url_handle.

    Tried in order: the file name of an attachment Content-Disposition,
    the part after the last "." of x-amz-meta-name, and the Content-Type
    (through mimetype2ext, to which default is passed).
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    amz_name = headers.get('x-amz-meta-name')
    if amz_name:
        ext = amz_name.rpartition('.')[2]
        if ext:
            return ext

    content_type = headers.get('Content-Type')
    return mimetype2ext(content_type, default=default)
3053
3054
def encode_data_uri(data, mime_type):
    """Return a base64 "data:" URI holding data (bytes-like) with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type!s};base64,{payload!s}'
3057
3058
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no limit is set (age_limit is None)
    # or when the content is available for everyone (content_limit is None)
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3067
3068
# List of known byte-order-marks (BOM)
# Each entry is (the bytes of the mark, the name of the encoding it stands for).
# The UTF-32 marks come before the UTF-16 ones since the UTF-16-LE mark is a
# prefix of the UTF-32-LE mark: code testing the entries in order (such as
# is_html) has to see the longer mark first
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3077
3078
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Leading byte-order-marks are removed and select the encoding (UTF-8 if
    there is none). Returns a match object if the decoded text starts with
    optional whitespace followed by "<", else None. """

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        while first_bytes.startswith(bom):
            first_bytes = first_bytes[len(bom):]
            encoding = bom_encoding

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3088
3089
def determine_protocol(info_dict):
    """Return the protocol of info_dict, deriving it from its URL if not set.

    An explicit 'protocol' wins. Otherwise the sanitized URL is checked for
    an rtmp/mms/rtsp prefix, then for an m3u8/f4m extension, and finally
    its scheme is returned.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        if info_dict.get('is_live'):
            return 'm3u8'
        return 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3110
3111
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned

    @param header_row  list with the column titles
    @param data        list of rows
    @param delim       if set, a line made of this string is put below the header
    @param extra_gap   additional number of spaces between columns
    @param hide_empty  drop the columns that are empty in every row of data
    @returns           the table as a single str
    """
    def visible_len(text):
        return len(remove_terminal_sequences(text).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_len(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, keep_flags):
        # Columns without a flag are kept
        return [cell for keep, cell in itertools.zip_longest(keep_flags, row, fillvalue=True) if keep]

    # A column whose widest data cell has zero width is falsy and gets dropped
    keep_flags = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, keep_flags)
    data = [keep_columns(row, keep_flags) for row in data]

    widths = column_widths([header_row] + data)
    extra_gap += 1

    table = [header_row]
    if delim:
        rule = [delim * (col_width + extra_gap) for col_width in widths]
        rule[-1] = rule[-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
        table.append(rule)
    table.extend(data)

    lines = []
    for row in table:
        cells = []
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # The padding goes where the tab is, pushing what follows to the right
                cells.append(text.replace('\t', ' ' * (widths[pos] - visible_len(text))) + ' ' * extra_gap)
            else:
                cells.append(text + ' ' * (widths[pos] - visible_len(text) + extra_gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3142
3143
3144 def _match_one(filter_part, dct, incomplete):
3145     # TODO: Generalize code with YoutubeDL._build_format_filter
3146     STRING_OPERATORS = {
3147         '*=': operator.contains,
3148         '^=': lambda attr, value: attr.startswith(value),
3149         '$=': lambda attr, value: attr.endswith(value),
3150         '~=': lambda attr, value: re.search(value, attr),
3151     }
3152     COMPARISON_OPERATORS = {
3153         **STRING_OPERATORS,
3154         '<=': operator.le,  # "<=" must be defined above "<"
3155         '<': operator.lt,
3156         '>=': operator.ge,
3157         '>': operator.gt,
3158         '=': operator.eq,
3159     }
3160
3161     if isinstance(incomplete, bool):
3162         is_incomplete = lambda _: incomplete
3163     else:
3164         is_incomplete = lambda k: k in incomplete
3165
3166     operator_rex = re.compile(r'''(?x)
3167         (?P<key>[a-z_]+)
3168         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3169         (?:
3170             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3171             (?P<strval>.+?)
3172         )
3173         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3174     m = operator_rex.fullmatch(filter_part.strip())
3175     if m:
3176         m = m.groupdict()
3177         unnegated_op = COMPARISON_OPERATORS[m['op']]
3178         if m['negation']:
3179             op = lambda attr, value: not unnegated_op(attr, value)
3180         else:
3181             op = unnegated_op
3182         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3183         if m['quote']:
3184             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3185         actual_value = dct.get(m['key'])
3186         numeric_comparison = None
3187         if isinstance(actual_value, (int, float)):
3188             # If the original field is a string and matching comparisonvalue is
3189             # a number we should respect the origin of the original field
3190             # and process comparison value as a string (see
3191             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3192             try:
3193                 numeric_comparison = int(comparison_value)
3194             except ValueError:
3195                 numeric_comparison = parse_filesize(comparison_value)
3196                 if numeric_comparison is None:
3197                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3198                 if numeric_comparison is None:
3199                     numeric_comparison = parse_duration(comparison_value)
3200         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3201             raise ValueError('Operator %s only supports string values!' % m['op'])
3202         if actual_value is None:
3203             return is_incomplete(m['key']) or m['none_inclusive']
3204         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3205
3206     UNARY_OPERATORS = {
3207         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3208         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3209     }
3210     operator_rex = re.compile(r'''(?x)
3211         (?P<op>%s)\s*(?P<key>[a-z_]+)
3212         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3213     m = operator_rex.fullmatch(filter_part.strip())
3214     if m:
3215         op = UNARY_OPERATORS[m.group('op')]
3216         actual_value = dct.get(m.group('key'))
3217         if is_incomplete(m.group('key')) and actual_value is None:
3218             return True
3219         return op(actual_value)
3220
3221     raise ValueError('Invalid filter part %r' % filter_part)
3222
3223
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns           Whether the filter passes
    @param filter_str  Conditions joined by "&"; a literal "&" is written as "\\&"
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    for raw_part in re.split(r'(?<!\\)&', filter_str):
        condition = raw_part.replace(r'\&', '&')
        if not _match_one(condition, dct, incomplete):
            return False
    return True
3234
3235
def match_filter_func(filters, breaking_filters=None):
    """Build a function that checks an info dict against filters.

    Returns None if neither filters nor breaking_filters are given.
    The returned function(info_dict, incomplete=False) gives None if any of
    the filters passes (NO_DEFAULT instead if "-" is among the filters and
    incomplete is false), else the message telling why the entry is skipped.
    It raises RejectedVideoReached with that message if the entry does not
    pass breaking_filters.
    """
    if not (filters or breaking_filters):
        return None
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    # "-" is not a filter itself; it only switches the interactive mode on
    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        rejection = breaking_filters(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        passed = not filters or any(match_str(f, info_dict, incomplete) for f in filters)
        if passed:
            return NO_DEFAULT if interactive and not incomplete else None

        video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
        filter_str = ') | ('.join(map(str.strip, filters))
        return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3258
3259
class download_range_func:
    """Callable yielding the sections of a video that are to be downloaded

    @param chapters   regexes; each chapter whose title matches one is yielded
    @param ranges     (start, end) pairs; a negative time is counted from the
                      end of the video if its duration is known
    @param from_info  also yield the start_time/end_time of the info dict if any is set
    """

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters = chapters
        self.ranges = ranges
        self.from_info = from_info

    def __call__(self, info_dict, ydl):
        if info_dict.get('chapters'):
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'

        for regex in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(regex, chapter['title']):
                    continue
                warning = None
                yield {**chapter, 'index': index}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        info_has_times = info_dict.get('start_time') or info_dict.get('end_time')
        if self.from_info and info_has_times:
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # Nothing was requested: a single empty section
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # Negative times are relative to the end, but never go before the start
        if info.get('duration') and time < 0:
            return max(info['duration'] + time, 0)
        return time

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        class_name = type(self).__name__
        return f'{__name__}.{class_name}({self.chapters}, {self.ranges})'
3300
3301
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds as a number.

    Accepted are a number with an optional "s" suffix and the clock form
    "H:MM:SS" with an optional fraction after "." or ":". Returns None for
    empty or unrecognized input.
    """
    if not time_expr:
        return None

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
3313
3314
def srt_subtitles_timecode(seconds):
    """Format seconds as an SRT timecode, "HH:MM:SS,mmm"."""
    time_parts = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % time_parts
3317
3318
def ass_subtitles_timecode(seconds):
    """Format seconds as an ASS timecode, "H:MM:SS.cc" (cc being centiseconds)."""
    time = timetuple_from_msec(seconds * 1000)
    leading_fields = time[:-1]
    centiseconds = time.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*leading_fields, centiseconds)
3322
3323
3324 def dfxp2srt(dfxp_data):
3325     '''
3326     @param dfxp_data A bytes-like object containing DFXP data
3327     @returns A unicode object containing converted SRT data
3328     '''
3329     LEGACY_NAMESPACES = (
3330         (b'http://www.w3.org/ns/ttml', [
3331             b'http://www.w3.org/2004/11/ttaf1',
3332             b'http://www.w3.org/2006/04/ttaf1',
3333             b'http://www.w3.org/2006/10/ttaf1',
3334         ]),
3335         (b'http://www.w3.org/ns/ttml#styling', [
3336             b'http://www.w3.org/ns/ttml#style',
3337         ]),
3338     )
3339
3340     SUPPORTED_STYLING = [
3341         'color',
3342         'fontFamily',
3343         'fontSize',
3344         'fontStyle',
3345         'fontWeight',
3346         'textDecoration'
3347     ]
3348
3349     _x = functools.partial(xpath_with_ns, ns_map={
3350         'xml': 'http://www.w3.org/XML/1998/namespace',
3351         'ttml': 'http://www.w3.org/ns/ttml',
3352         'tts': 'http://www.w3.org/ns/ttml#styling',
3353     })
3354
3355     styles = {}
3356     default_style = {}
3357
3358     class TTMLPElementParser:
3359         _out = ''
3360         _unclosed_elements = []
3361         _applied_styles = []
3362
3363         def start(self, tag, attrib):
3364             if tag in (_x('ttml:br'), 'br'):
3365                 self._out += '\n'
3366             else:
3367                 unclosed_elements = []
3368                 style = {}
3369                 element_style_id = attrib.get('style')
3370                 if default_style:
3371                     style.update(default_style)
3372                 if element_style_id:
3373                     style.update(styles.get(element_style_id, {}))
3374                 for prop in SUPPORTED_STYLING:
3375                     prop_val = attrib.get(_x('tts:' + prop))
3376                     if prop_val:
3377                         style[prop] = prop_val
3378                 if style:
3379                     font = ''
3380                     for k, v in sorted(style.items()):
3381                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3382                             continue
3383                         if k == 'color':
3384                             font += ' color="%s"' % v
3385                         elif k == 'fontSize':
3386                             font += ' size="%s"' % v
3387                         elif k == 'fontFamily':
3388                             font += ' face="%s"' % v
3389                         elif k == 'fontWeight' and v == 'bold':
3390                             self._out += '<b>'
3391                             unclosed_elements.append('b')
3392                         elif k == 'fontStyle' and v == 'italic':
3393                             self._out += '<i>'
3394                             unclosed_elements.append('i')
3395                         elif k == 'textDecoration' and v == 'underline':
3396                             self._out += '<u>'
3397                             unclosed_elements.append('u')
3398                     if font:
3399                         self._out += '<font' + font + '>'
3400                         unclosed_elements.append('font')
3401                     applied_style = {}
3402                     if self._applied_styles:
3403                         applied_style.update(self._applied_styles[-1])
3404                     applied_style.update(style)
3405                     self._applied_styles.append(applied_style)
3406                 self._unclosed_elements.append(unclosed_elements)
3407
3408         def end(self, tag):
3409             if tag not in (_x('ttml:br'), 'br'):
3410                 unclosed_elements = self._unclosed_elements.pop()
3411                 for element in reversed(unclosed_elements):
3412                     self._out += '</%s>' % element
3413                 if unclosed_elements and self._applied_styles:
3414                     self._applied_styles.pop()
3415
3416         def data(self, data):
3417             self._out += data
3418
3419         def close(self):
3420             return self._out.strip()
3421
3422     # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
3423     # This will not trigger false positives since only UTF-8 text is being replaced
3424     dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
3425
3426     def parse_node(node):
3427         target = TTMLPElementParser()
3428         parser = xml.etree.ElementTree.XMLParser(target=target)
3429         parser.feed(xml.etree.ElementTree.tostring(node))
3430         return parser.close()
3431
3432     for k, v in LEGACY_NAMESPACES:
3433         for ns in v:
3434             dfxp_data = dfxp_data.replace(ns, k)
3435
3436     dfxp = compat_etree_fromstring(dfxp_data)
3437     out = []
3438     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3439
3440     if not paras:
3441         raise ValueError('Invalid dfxp/TTML subtitle')
3442
3443     repeat = False
3444     while True:
3445         for style in dfxp.findall(_x('.//ttml:style')):
3446             style_id = style.get('id') or style.get(_x('xml:id'))
3447             if not style_id:
3448                 continue
3449             parent_style_id = style.get('style')
3450             if parent_style_id:
3451                 if parent_style_id not in styles:
3452                     repeat = True
3453                     continue
3454                 styles[style_id] = styles[parent_style_id].copy()
3455             for prop in SUPPORTED_STYLING:
3456                 prop_val = style.get(_x('tts:' + prop))
3457                 if prop_val:
3458                     styles.setdefault(style_id, {})[prop] = prop_val
3459         if repeat:
3460             repeat = False
3461         else:
3462             break
3463
3464     for p in ('body', 'div'):
3465         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3466         if ele is None:
3467             continue
3468         style = styles.get(ele.get('style'))
3469         if not style:
3470             continue
3471         default_style.update(style)
3472
3473     for para, index in zip(paras, itertools.count(1)):
3474         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3475         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3476         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3477         if begin_time is None:
3478             continue
3479         if not end_time:
3480             if not dur:
3481                 continue
3482             end_time = begin_time + dur
3483         out.append('%d\n%s --> %s\n%s\n\n' % (
3484             index,
3485             srt_subtitles_timecode(begin_time),
3486             srt_subtitles_timecode(end_time),
3487             parse_node(para)))
3488
3489     return ''.join(out)
3490
3491
def cli_option(params, command_option, param, separator=None):
    """Build the command-line fragment for a single valued option.

    Looks up `param` in `params`. Returns an empty list when the value is
    None, `[command_option, str(value)]` when no separator is given, and a
    single `command_option + separator + value` item otherwise.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3497
3498
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command-line fragment for a boolean option.

    The value stored under `param` must be True, False or None. True/False
    are rendered as `true_value`/`false_value` through cli_option(); None
    (or a missing key) yields an empty list.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered_values = {True: true_value, False: false_value}
    return cli_option(rendered_values, command_option, flag, separator)
3503
3504
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return `[command_option]` if `params[param]` equals `expected_value`, else `[]`."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3507
3508
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick the argument list configured for the first matching key group.

    @param argdict      dict mapping lower-case keys to argument lists; a bare
                        list/tuple is the legacy form and is returned as-is
                        when `use_compat` is true (otherwise it is ignored)
    @param keys         list/tuple of key groups, tried in order; each group
                        is a single key or a collection of keys
    @param default      returned when nothing matches or argdict is None
    @returns            the concatenated argument lists of every key of the
                        first group that has at least one entry in argdict
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_group in keys:
        candidates = [argdict.get(key.lower()) for key in variadic(key_group)]
        present = [args for args in candidates if args is not None]
        if present:
            return list(itertools.chain.from_iterable(present))
    return default
3527
3528
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve configured arguments for `exe` when run on behalf of `main_key`.

    Builds the lookup keys (`exe` alone when it equals `main_key`, otherwise
    `main_key+exe`, each combined with the suffixes in `keys`) and delegates
    to cli_configuration_args(). When the bare root key is among them, the
    `(main_key, exe)` group (only if the two differ) and 'default' are added
    as fallbacks; otherwise the legacy list form of argdict is not honoured.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    same_name = main_key == exe
    root_key = exe if same_name else f'{main_key}+{exe}'
    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if not same_name:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3540
3541
class ISO639Utils:
    """Conversions between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of `code` are looked up, so anything
        after them (e.g. the region in 'en-US') is ignored.
        Returns None for unknown codes.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        When several two-letter codes share a three-letter code, the one
        listed first in the table wins. Returns None for unknown codes.
        """
        matches = (
            short_name for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
3746
3747
class ISO3166Utils:
    """Lookup of full country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter ISO 3166 country code to the corresponding full name

        The lookup is case-insensitive; None is returned for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4009
4010
class GeoUtils:
    """Helpers for picking IPv4 addresses inside a country's address block."""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random dotted-quad IPv4 address from a country or CIDR block

        A two-character argument is treated as a country code (matched
        case-insensitively against the table above); None is returned when
        the code is unknown. Anything else must be an 'address/prefixlen'
        block.
        """
        block = code_or_block
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        network, prefix_length = block.split('/')
        (lowest,) = struct.unpack('!L', socket.inet_aton(network))
        # Set every host bit to get the highest address of the block
        highest = lowest | (0xffffffff >> int(prefix_length))
        chosen = random.randint(lowest, highest)
        return str(socket.inet_ntoa(struct.pack('!L', chosen)))
4269
4270
4271 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4272 # released into Public Domain
4273 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4274
def long_to_bytes(n, blocksize=0):
    """Convert a non-negative integer to its big-endian byte string.

    The result has no leading zero bytes, except that zero (and any
    non-positive value) is encoded as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    words = []
    # Peel off 32 bits at a time, least significant word first
    while n > 0:
        words.append(struct.pack('>I', n & 0xffffffff))
        n >>= 32
    # Most significant word first; drop leading zeros but keep at least one byte
    s = b''.join(reversed(words)).lstrip(b'\000') or b'\000'
    if blocksize > 0:
        remainder = len(s) % blocksize
        if remainder:
            s = b'\000' * (blocksize - remainder) + s
    return s
4303
4304
def bytes_to_long(s):
    """Convert a big-endian byte string to an integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    remainder = len(s) % 4
    if remainder:
        # Left-pad with zeros to a whole number of 32-bit words
        s = b'\000' * (4 - remainder) + s
    value = 0
    for offset in range(0, len(s), 4):
        (word,) = struct.unpack_from('>I', s, offset)
        value = (value << 32) | word
    return value
4320
4321
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The byte order is reversed before the data is read as a hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    ciphertext = pow(int(reversed_hex, 16), exponent, modulus)
    return '%x' % ciphertext
4337
4338
def pkcs1pad(data, length):
    """
    Pad input data with the PKCS#1 v1.5 encryption scheme (block type 2)

    The result has the layout `00 02 <padding> 00 <data>`.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data does not leave room for at least
                               8 padding bytes
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding bytes must be nonzero: the first zero byte after the block
    # type marks the end of the padding, so a zero inside it would truncate
    # the padding and corrupt the decoded message
    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2, *padding, 0, *data]
4352
4353
def _base_n_table(n, table):
    """
    Return the digit table to use for base-n conversion

    @param n        base; the table is cut down to its first n symbols
    @param table    custom digit symbols; defaults to 0-9, a-z, A-Z
    @raises ValueError  if neither is given or the table has fewer than n symbols
    """
    if not table and not n:
        raise ValueError('Either table or n must be specified')
    table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]

    if n and n != len(table):
        raise ValueError(f'base {n} exceeds table length {len(table)}')
    return table


def encode_base_n(num, n=None, table=None):
    """
    Convert given int to a base-n string

    @raises ValueError  if num is negative or the table has fewer than two
                        symbols (neither can be represented)
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    # Floor division never reaches zero for negative numbers or in base 1,
    # so reject those instead of looping forever
    if num < 0:
        raise ValueError('cannot encode a negative number')
    if base < 2:
        raise ValueError('at least two symbols are required to encode a number')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))


def decode_base_n(string, n=None, table=None):
    """
    Convert given base-n string to int

    @raises KeyError    if the string contains a symbol that is not in the table
    """
    values = {char: index for index, char in enumerate(_base_n_table(n, table))}
    result, base = 0, len(values)
    for char in string:
        result = result * base + values[char]
    return result
4384
4385
def decode_packed_codes(code):
    """
    Expand code matched by PACKED_CODES_RE

    Every word of the obfuscated code is replaced with the symbol stored at
    the index that the word encodes in the given base; an empty symbol means
    that the word stands for itself.
    """
    obfuscated_code, base, count, symbols = re.search(PACKED_CODES_RE, code).groups()
    base = int(base)
    remaining = int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    while remaining:
        remaining -= 1
        word = encode_base_n(remaining, base)
        symbol_table[word] = symbols[remaining] or word

    def _expand(match):
        return symbol_table[match.group(0)]

    return re.sub(r'\b(\w+)\b', _expand, obfuscated_code)
4402
4403
def caesar(s, alphabet, shift):
    """
    Apply a caesar cipher to s, moving each character `shift` positions
    forward in `alphabet` (wrapping around at the end)

    Characters that are not part of the alphabet are kept unchanged.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def _rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(_rotate, s))
4411
4412
def rot47(s):
    """Rotate every character of s that is in the alphabet below by 47 positions"""
    alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, alphabet, 47)
4415
4416
def parse_m3u8_attributes(attrib):
    """
    Parse a comma separated `KEY=value` attribute list into a dict

    Values may be quoted; the surrounding quotes are removed.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in pairs
    }
4424
4425
def urshift(val, n):
    """Shift val right by n bits, treating negative values as unsigned 32-bit integers"""
    if val < 0:
        val += 0x100000000
    return val >> n
4428
4429
def write_xattr(path, key, value):
    """
    Set the extended attribute `key` of the file at `path` to `value`

    `value` is written unmodified in binary mode on Windows and is decoded
    with `.decode()` before being passed to an external tool, so it has to be
    a bytes object.

    @raises XAttrMetadataError      if the attribute could not be written
    @raises XAttrUnavailableError   if neither a python module nor an
                                    executable for setting xattrs is found
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        # The stream is addressed as "path:key", so the key itself must not contain a colon
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    setxattr = None
    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        # An older pyxattr leaves setxattr unset, so the executables below are tried instead
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The value is passed as a command line argument, which has to be text
    value = value.decode()
    try:
        # The two tools take different arguments: "xattr -w key value path" and "setfattr -n key -v value path"
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    # A non-zero exit status is reported together with the tool's stderr output
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4481
4482
def random_birthday(year_field, month_field, day_field):
    """
    Pick a random date between 1950-01-01 and 1995-12-31 (both inclusive)

    @returns    dict mapping the given field names to the year, month and day
                of the date, each as a string without zero padding
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    days_after_earliest = random.randint(0, (latest - earliest).days)
    birthday = earliest + datetime.timedelta(days_after_earliest)
    fields = (year_field, month_field, day_field)
    values = (birthday.year, birthday.month, birthday.day)
    return dict(zip(fields, map(str, values)))
4493
4494
def find_available_port(interface=''):
    """
    Bind a temporary socket to port 0 of the given interface and return the
    port number that it got assigned, or None if that fails with an OSError
    """
    with contextlib.suppress(OSError):
        with socket.socket() as sock:
            sock.bind((interface, 0))
            return sock.getsockname()[1]
    return None
4502
4503
# Templates for internet shortcut files, which are plain text files.
# They are filled in with %-formatting from a mapping: every template uses
# the "url" key, the desktop entry additionally uses "filename".

# INI-style "[InternetShortcut]" file
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# XML property list (Apple plist DTD) with the URL as its only entry
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# "[Desktop Entry]" file of type "Link"
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Lookup of the templates above by their short name
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4535
4536
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    The hostname is converted to Punycode. An explicit port is preserved, except for port 80 of `http` URLs, which is the default and gets dropped. IRIs without a hostname (e.g. relative references) only have their remaining components escaped.

    Raises a ValueError for IPv6 hosts, which are not supported.
    """

    iri_parts = urllib.parse.urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no network location at all
    if iri_parts.hostname:
        # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
        net_location += iri_parts.hostname.encode('idna').decode()

    # Port 80 is only the default for http; dropping it for any other scheme
    # (e.g. https) would make the URI point to a different port
    port = iri_parts.port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4579
4580
def to_high_limit_path(path):
    """
    Return the absolute path prefixed with `\\\\?\\` on win32/cygwin and the
    unmodified path on every other platform
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
4587
4588
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """
    Look up `field` in `obj` with traverse_obj and format it with `template`

    @param ignore   value(s) for which `default` is returned instead;
                    if not given, every falsy value is ignored
    @param func     applied to the value before it is formatted
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        ignored = not val
    else:
        ignored = val in variadic(ignore)
    return default if ignored else template % func(val)
4594
4595
def clean_podcast_url(url):
    """
    Remove the prefixes of known podcast tracking/analytics services from url

    If stripping leaves two schemes in a row, only the second one is kept.
    """
    without_trackers = re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/''', '', url)
    return re.sub(r'^\w+://(\w+://)', r'\1', without_trackers)
4616
4617
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random version 4 UUID (RFC 4122) as a lowercase string"""
    def _random_digit(mobj):
        # "y" holds the variant: its two most significant bits must be "10",
        # which only leaves 8, 9, a and b. "x" may be any hex digit
        return random.choice('89ab' if mobj.group(0) == 'y' else _HEX_TABLE)

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4623
4624
def make_dir(path, to_screen=None):
    """
    Create the parent directory of path (including missing ancestors)

    @param to_screen    optional callable that receives a message if the
                        directory could not be created
    @returns            True on success (or if path has no directory part),
                        False if creating the directory failed
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Only report when a reporter was actually given
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4635
4636
def get_executable_path():
    """Return the absolute path of the directory containing the executable reported by the updater"""
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
4641
4642
def get_user_config_dirs(package_name):
    """
    Yield the candidate per-user configuration directories of package_name:
    the XDG config directory, the %APPDATA% directory (only if that
    environment variable is set) and a dot-directory in the user's home
    """
    # .config (e.g. ~/.config/package_name)
    config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # home (~/.package_name)
    home = compat_expanduser('~')
    yield os.path.join(home, f'.{package_name}')
4655
4656
def get_system_config_dirs(package_name):
    """Yield the candidate system-wide configuration directories of package_name"""
    # /etc/package_name
    yield from (os.path.join('/etc', package_name), )
4660
4661
def time_seconds(**kwargs):
    """
    Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the datetime.timedelta that is built from the keyword arguments
    """
    now = time.time()
    offset = datetime.timedelta(**kwargs)
    return now + offset.total_seconds()
4667
4668
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """
    Create a JWT that is signed with HMAC-SHA256

    @param payload_data     JSON serializable claims
    @param key              secret used for signing, as a str
    @param headers          optional dict of header fields that are added to
                            (or override) the default "alg" and "typ" fields
    @returns                the token `header.payload.signature` as bytes
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)

    def _b64url(data):
        # JWS uses the URL-safe base64 alphabet with the trailing padding removed
        return base64.urlsafe_b64encode(data).rstrip(b'=')

    header_b64 = _b64url(json.dumps(header_data).encode())
    payload_b64 = _b64url(json.dumps(payload_data).encode())
    # The signature covers the encoded header and payload, joined by a dot
    signing_input = header_b64 + b'.' + payload_b64
    signature = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + _b64url(signature)
4686
4687
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """
    Return the decoded JSON payload of a three-part JWT

    Neither the header nor the signature is inspected or verified.
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded_payload = payload_b64 + '==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
4694
4695
# None when not on Windows; False on Windows until windows_enable_vt_mode() succeeded
WINDOWS_VT_MODE = None if compat_os_name != 'nt' else False


@functools.cache
def supports_terminal_sequences(stream):
    """
    Whether terminal sequences may be written to stream (cached per stream)

    This requires VT mode to be enabled on Windows and the TERM environment
    variable to be set elsewhere; in both cases the stream must be a tty.
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
4710
4711
def windows_enable_vt_mode():
    """
    Enable virtual terminal processing for the Windows console

    Does nothing when get_windows_version() reports less than (10, 0, 10586).
    On success, WINDOWS_VT_MODE is set to True and the cache of
    supports_terminal_sequences is cleared.

    Raises an Exception if the console mode cannot be read or changed.

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported here since these modules are only needed (and msvcrt only available) on Windows
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        # Convert the file descriptor into the OS handle that the console API expects
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Add the VT flag while keeping all the flags that are already set
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Results cached while VT mode was still disabled are outdated now
    supports_terminal_sequences.cache_clear()
4742
4743
# ESC, "[", one or more characters other than "m", then "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return string with everything that matches _terminal_sequences_re removed"""
    return re.sub(_terminal_sequences_re, '', string)
4749
4750
def number_of_digits(number):
    """Return the length of number when formatted with `%d` (a minus sign counts as well)"""
    formatted = '%d' % number
    return len(formatted)
4753
4754
def join_nonempty(*values, delim='-', from_dict=None):
    """
    Join the string form of all truthy values with delim

    If from_dict is given, the values are treated as keys/paths that are
    looked up in it with traverse_obj first.
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
4759
4760
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    The thumbnails are returned as they are if no format has a width.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    # Tuples compare by width first; missing dimensions count as 0
    largest = max(
        (tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats),
        default=(0, 0))
    max_width = largest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': scaled_url},
            dict(zip(dimension_keys, largest)), thumbnail))
    return scaled
4781
4782
def parse_http_range(range):
    """
    Parse value of "Range" or "Content-Range" HTTP header into tuple.

    @returns    (start, end, total); all three are None if the value is
                empty or is not a byte range
    """
    match = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not match:
        return None, None, None
    start, end, total = match.groups()
    return int(start), int_or_none(end), int_or_none(total)
4791
4792
def read_stdin(what):
    """
    Announce that `what` is about to be read from STDIN (naming the key
    combination that ends the input) and return sys.stdin
    """
    eof = {'nt': 'Ctrl+Z'}.get(compat_os_name, 'Ctrl+D')
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
4797
4798
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)

    The encoding is None if it could not be detected.
    """

    # BOM marks are given priority over declarations
    bom_match = next(((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_match is not None:
        return bom_match

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    without_nulls = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', without_nulls)
    if declaration is None:
        return None, 0
    return declaration.group(1).decode(), 0
4815
4816
class Config:
    """
    A set of command line arguments together with the nested configurations
    that these arguments load through their config locations

    @param parser   option parser; it must provide parse_known_args,
                    parse_args and error, and the parsed options must have
                    a `config_locations` attribute
    @param label    short description of where the arguments come from
    """
    own_args = None         # The arguments given to this configuration
    parsed_args = None      # own_args, once the config locations in them were processed
    filename = None         # The file that own_args were read from, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        # _loaded_paths is shared with all nested configurations (see append_config)
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """
        Set the arguments of this configuration and load the nested ones

        @returns    False if the file was already loaded before, else True
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Load the configurations that are referenced by the own arguments"""
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            # Every file is loaded only once, which also prevents endless recursion
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against the directory of the current file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        # Nested configurations are listed below the own arguments, prefixed with "| "
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """
        Read the arguments from a configuration file

        @param default  returned as it is when the file cannot be opened.
                        NB: The default list is shared between calls
        @returns        list of arguments
        @raises ValueError  if the file cannot be decoded or parsed
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present

        # The file is closed no matter which of the steps below fails
        with optionf:
            try:
                enc, skip = determine_file_encoding(optionf.read(512))
                optionf.seek(skip, io.SEEK_SET)
            except OSError:
                enc = None  # silently skip read errors
            try:
                # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                contents = optionf.read().decode(enc or preferredencoding())
                res = shlex.split(contents, comments=True)
            except Exception as err:
                raise ValueError(f'Unable to parse "{filename}": {err}') from err
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of opts with the values of all login options replaced by "PRIVATE\""""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # "--option=value" form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # "--option value" form
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a nested configuration from the arguments of init(); it is kept unless it was already loaded"""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """The arguments of all nested configurations (last one first), followed by the own arguments"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
4924
4925
class WebSocketsWrapper:
    """
    Wraps websockets module to use in non-async scopes

    Every instance owns an event loop that all its coroutines are run on.
    The connection is closed by __exit__, which is also registered to run
    at interpreter exit.
    """
    pool = None  # The established connection, set by __enter__

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        # __exit__ is also registered with atexit, so it may be called again
        # after the connection was already closed explicitly
        if self.loop.is_closed():
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # The remaining tasks have to be cancelled while the loop can
            # still run; a closed loop cannot run anything anymore
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` on `loop` until it completes and return its result"""
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel all unfinished tasks of `loop` and wait until they are done"""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # The "loop" parameter of gather() was removed in python 3.10. It is
        # not needed either, since gather() uses the loop of the given tasks
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
4995
4996
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            # Title-casing the names makes differently cased names collide
            merged[name.title()] = value
    return merged
5000
5001
def cached_method(f):
    """
    Cache a method

    The results are stored on the instance, separately for every method and
    for every combination of arguments (with the defaults applied).
    """
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        # The first bound argument is the instance itself
        _, *call_values = bound.arguments.values()
        key = tuple(call_values)

        method_caches = vars(self).setdefault('_cached_method__cache', {})
        cache = method_caches.setdefault(f.__name__, {})
        if key in cache:
            return cache[key]
        result = cache[key] = f(self, *args, **kwargs)
        return result
    return wrapper
5017
5018
class classproperty:
    """
    property access for class methods with optional caching

    Can be used as `@classproperty` or as `@classproperty(cache=True)`.
    The cached values are stored per class.
    """
    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Used with arguments only: return a decorator that still awaits the function
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls not in cache:
            cache[cls] = self.func(cls)
        return cache[cls]
5037
5038
class function_with_repr:
    """
    Callable wrapper around func with a custom repr()

    Without an explicit repr_, the module and qualified name of func are used.
    """
    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5051
5052
class Namespace(types.SimpleNamespace):
    """Namespace that can be iterated over; iteration yields the attribute values"""

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        """The (name, value) pairs of all attributes"""
        return vars(self).items()
5062
5063
# File extensions (without the leading dot), grouped by the kind of media
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# "video" and "audio" also include the respective common extensions (appended at the end)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in that order
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5078
5079
5080 class RetryManager:
5081     """Usage:
5082         for retry in RetryManager(...):
5083             try:
5084                 ...
5085             except SomeException as err:
5086                 retry.error = err
5087                 continue
5088     """
5089     attempt, _error = 0, None
5090
5091     def __init__(self, _retries, _error_callback, **kwargs):
5092         self.retries = _retries or 0
5093         self.error_callback = functools.partial(_error_callback, **kwargs)
5094
5095     def _should_retry(self):
5096         return self._error is not NO_DEFAULT and self.attempt <= self.retries
5097
5098     @property
5099     def error(self):
5100         if self._error is NO_DEFAULT:
5101             return None
5102         return self._error
5103
5104     @error.setter
5105     def error(self, value):
5106         self._error = value
5107
5108     def __iter__(self):
5109         while self._should_retry():
5110             self.error = NO_DEFAULT
5111             self.attempt += 1
5112             yield self
5113             if self.error:
5114                 self.error_callback(self.error, self.attempt, self.retries)
5115
5116     @staticmethod
5117     def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5118         """Utility function for reporting retries"""
5119         if count > retries:
5120             if error:
5121                 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5122             raise e
5123
5124         if not count:
5125             return warn(e)
5126         elif isinstance(e, ExtractorError):
5127             e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5128         warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5129
5130         delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5131         if delay:
5132             info(f'Sleeping {delay:.2f} seconds ...')
5133             time.sleep(delay)
5134
5135
def make_archive_id(ie, video_id):
    """
    Build the archive id `<lowercased extractor key> <video id>`

    @param ie   either the extractor key as a str or an object that provides ie_key()
    """
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    lowered_key = ie_key.lower()
    return f'{lowered_key} {video_id}'
5139
5140
def truncate_string(s, left, right=0):
    """
    Shorten s to `left + right` characters by replacing its middle with "..."

    The result consists of the first `left - 3` characters, the ellipsis and
    the last `right` characters. None and strings that are short enough are
    returned unchanged.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5146
5147
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """
    Build an ordered set of items from a list of options

    Every option either adds items or, when prefixed with "-", removes them.
    An option can be an alias from alias_dict (which expands to a list of
    further options) or an item of `alias_dict['all']`; with use_regex, a
    non-alias option is a case insensitive regex that selects all items it
    fully matches.

    @param start    items to begin with
    @raises ValueError  for an option that is neither an alias nor a known item
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for option in options:
        discard = option.startswith('-')
        name = option[1:] if discard else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if discard:
                # Discarding an alias inverts each of the options that it stands for
                expansion = [
                    i[1:] if i.startswith('-') else f'-{i}' for i in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            current = [name]
        else:
            raise ValueError(name)

        if not discard:
            requested.extend(current)
            continue
        for item in current:
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5176
5177
5178 # TODO: Rewrite
class FormatSorter:
    """Compute per-format preference tuples from a list of sort specifications.

    A specification is parsed with `regex`: an optional leading "+" (reverse),
    a field name, and an optional limit introduced by ":" or "~" ("~" marks
    the limit as "closest"). `settings` describes how every known field is
    read from a format dict and converted.

    Every instance works on its own copy of `settings`: the methods below
    store limits, cached defaults and conversion overrides in it, and that
    state must not leak into the class-level table or into other instances.
    """
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    # Always appended after the user- and extractor-given specifications
    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Not referenced inside this class; presumably an alternative order picked by callers
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field settings. Keys used by the methods of this class:
    #   type         'extractor', 'boolean', 'ordered', 'field' (the default), 'multiple', 'combined' or 'alias'
    #   field        key(s) read from the format dict; for an alias, the sort field it stands for
    #   convert      how a limit text is converted (see _resolve_field_value)
    #   order        ranked values of an 'ordered' field, best first
    #   order_free   replaces 'order' when the 'prefer_free_formats' param is truthy
    #   regex        the entries of 'order' are patterns applied with re.match
    #   forced       always placed at the front of the sort order
    #   priority     placed at the front unless the 'format_sort_force' param is truthy
    #   visible      whether the field is listed by print_verbose_info (defaults to True)
    #   default      fallback used when the value cannot be converted to a float
    #   max          'extractor' type only: values >= max (or None) are replaced by -1
    #   in_list / not_in_list   'boolean' type only: membership tests deciding between 0 and -1
    #   function     'multiple' type only: reduces the values of all 'field' keys to one
    #   deprecated   alias only: emit a deprecation notice when it is used
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        """
        @param ydl                Object providing `params`, `deprecated_feature` and `write_debug`
        @param field_preference   Sort specifications given by the extractor
        """
        self.ydl = ydl
        self._order = []
        # Shadow the class-level table with a private copy (one fresh dict per field).
        # The methods below write limits, cached defaults and 'convert' overrides into
        # `settings`; without the copy those writes would be shared by every instance
        self.settings = {field: dict(props) for field, props in self.settings.items()}
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return setting `key` of `field`, computing and caching a default when unset.

        For a field without a settings entry, 'forced' and 'priority' are simply
        False; any other key triggers a deprecation notice and creates an empty
        entry for the field.
        """
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            # Cache the computed default so later lookups take the fast path
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert the text `value` according to the 'convert' setting of `field`.

        None is returned as-is unless `convertNone` is set; any other value is
        lower-cased first. Depending on the conversion the result is the string
        itself, a number, a rank within the field's order list (higher is
        better), or None for 'ignore'.
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # Position of the '' entry is the rank given to values that are not listed
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or  value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # A non-numeric limit switches the field to string comparison
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Build the sort order from `params` and the extractor's specifications.

        The order is: forced default fields, priority default fields (skipped
        when params['format_sort_force'] is truthy), params['format_sort'],
        `sort_extractor`, and finally `default`. A field is only added the
        first time it is seen. Raises ExtractorError for a specification that
        does not match `regex`.
        """
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Register `field` in the order (first occurrence wins) and store its sort data
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
                else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A 'combined' field expands into its member fields; its limit may hold
            # one ":"-separated value per member
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Pass the user's, the extractor's and the effective sort order to `write_debug`."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Return the preference tuple of `field` for an already extracted `value`.

        `type` selects the pre-processing ('extractor', 'boolean' or 'ordered');
        afterwards the value is converted to a number where possible and
        compared against the field's limit, honouring 'reverse' and 'closest'.
        """
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Read the value of `field` from `format` and return its preference tuple."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            # Reduce the values of all member fields to a single one
            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return a tuple with one preference tuple per field of the sort order.

        NB: `format` is modified in place; missing 'protocol', 'ext',
        'video_ext', 'audio_ext' and bitrate entries are filled in first.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5473
5474
5475 # XXX: Temporary
5476 class _YDLLogger:
5477     def __init__(self, ydl=None):
5478         self._ydl = ydl
5479
5480     def debug(self, message):
5481         if self._ydl:
5482             self._ydl.write_debug(message)
5483
5484     def info(self, message):
5485         if self._ydl:
5486             self._ydl.to_screen(message)
5487
5488     def warning(self, message, *, once=False):
5489         if self._ydl:
5490             self._ydl.report_warning(message, once)
5491
5492     def error(self, message, *, is_error=True):
5493         if self._ydl:
5494             self._ydl.report_error(message, is_error=is_error)
5495
5496     def stdout(self, message):
5497         if self._ydl:
5498             self._ydl.to_stdout(message)
5499
5500     def stderr(self, message):
5501         if self._ydl:
5502             self._ydl.to_stderr(message)