yt_dlp/utils/_utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import collections.abc
   9 import contextlib
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import hashlib
  15 import hmac
  16 import html.entities
  17 import html.parser
  18 import inspect
  19 import io
  20 import itertools
  21 import json
  22 import locale
  23 import math
  24 import mimetypes
  25 import netrc
  26 import operator
  27 import os
  28 import platform
  29 import random
  30 import re
  31 import shlex
  32 import socket
  33 import ssl
  34 import struct
  35 import subprocess
  36 import sys
  37 import tempfile
  38 import time
  39 import traceback
  40 import types
  41 import unicodedata
  42 import urllib.error
  43 import urllib.parse
  44 import urllib.request
  45 import xml.etree.ElementTree
  46
  47 from . import traversal
  48
  49 from ..compat import functools  # isort: split
  50 from ..compat import (
  51     compat_etree_fromstring,
  52     compat_expanduser,
  53     compat_HTMLParseError,
  54     compat_os_name,
  55     compat_shlex_quote,
  56 )
  57 from ..dependencies import websockets, xattr
  58
# Strip the last dotted component from this module's name
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
# (the type of the object returned by re.compile)
compiled_regex_type = type(re.compile(''))
  63
  64
  65 class NO_DEFAULT:
  66     pass
  67
  68
  69 def IDENTITY(x):
  70     return x
  71
  72
# Full English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code; every list is ordered January..December
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
  88
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Maps timezone abbreviations to their offset from UTC, expressed in whole hours
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
  98
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The two sequences are
# paired positionally by zip(); multi-character replacements ('AE', 'OE', 'TH',
# 'ss', ...) are passed as single list items so that they stay aligned
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 103
# strptime-style format strings that are not ambiguous about day/month order
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# The common formats plus numeric formats read as day before month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# The common formats plus numeric formats read as month before day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 169
# Captures four groups from a `}('...',N,N,'...'.split('|')` call:
# a quoted string, two integers and a '|'-separated quoted string
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element; the JSON object or
# array it contains is captured in the named group `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Unsigned decimal number with an optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
 174
 175
 176 @functools.cache
 177 def preferredencoding():
 178     """Get preferred encoding.
 179
 180     Returns the best encoding scheme for the system, based on
 181     locale.getpreferredencoding() and some further tweaks.
 182     """
 183     try:
 184         pref = locale.getpreferredencoding()
 185         'TEST'.encode(pref)
 186     except Exception:
 187         pref = 'UTF-8'
 188
 189     return pref
 190
 191
 192 def write_json_file(obj, fn):
 193     """ Encode obj as JSON and write it to fn, atomically if possible """
 194
 195     tf = tempfile.NamedTemporaryFile(
 196         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 197         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 198
 199     try:
 200         with tf:
 201             json.dump(obj, tf, ensure_ascii=False)
 202         if sys.platform == 'win32':
 203             # Need to remove existing file on Windows, else os.rename raises
 204             # WindowsError or FileExistsError.
 205             with contextlib.suppress(OSError):
 206                 os.unlink(fn)
 207         with contextlib.suppress(OSError):
 208             mask = os.umask(0)
 209             os.umask(mask)
 210             os.chmod(tf.name, 0o666 & ~mask)
 211         os.rename(tf.name, fn)
 212     except Exception:
 213         with contextlib.suppress(OSError):
 214             os.remove(tf.name)
 215         raise
 216
 217
 218 def find_xpath_attr(node, xpath, key, val=None):
 219     """ Find the xpath xpath[@key=val] """
 220     assert re.match(r'^[a-zA-Z_-]+$', key)
 221     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 222     return node.find(expr)
 223
# On Python 2.6, the xml.etree.ElementTree.Element methods do not support
# the namespaces parameter
 226
 227
 228 def xpath_with_ns(path, ns_map):
 229     components = [c.split(':') for c in path.split('/')]
 230     replaced = []
 231     for c in components:
 232         if len(c) == 1:
 233             replaced.append(c[0])
 234         else:
 235             ns, tag = c
 236             replaced.append('{%s}%s' % (ns_map[ns], tag))
 237     return '/'.join(replaced)
 238
 239
 240 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 241     def _find_xpath(xpath):
 242         return node.find(xpath)
 243
 244     if isinstance(xpath, str):
 245         n = _find_xpath(xpath)
 246     else:
 247         for xp in xpath:
 248             n = _find_xpath(xp)
 249             if n is not None:
 250                 break
 251
 252     if n is None:
 253         if default is not NO_DEFAULT:
 254             return default
 255         elif fatal:
 256             name = xpath if name is None else name
 257             raise ExtractorError('Could not find XML element %s' % name)
 258         else:
 259             return None
 260     return n
 261
 262
 263 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 264     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 265     if n is None or n == default:
 266         return n
 267     if n.text is None:
 268         if default is not NO_DEFAULT:
 269             return default
 270         elif fatal:
 271             name = xpath if name is None else name
 272             raise ExtractorError('Could not find XML element\'s text %s' % name)
 273         else:
 274             return None
 275     return n.text
 276
 277
 278 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 279     n = find_xpath_attr(node, xpath, key)
 280     if n is None:
 281         if default is not NO_DEFAULT:
 282             return default
 283         elif fatal:
 284             name = f'{xpath}[@{key}]' if name is None else name
 285             raise ExtractorError('Could not find XML attribute %s' % name)
 286         else:
 287             return None
 288     return n.attrib[key]
 289
 290
 291 def get_element_by_id(id, html, **kwargs):
 292     """Return the content of the tag with the specified ID in the passed HTML document"""
 293     return get_element_by_attribute('id', id, html, **kwargs)
 294
 295
 296 def get_element_html_by_id(id, html, **kwargs):
 297     """Return the html of the tag with the specified ID in the passed HTML document"""
 298     return get_element_html_by_attribute('id', id, html, **kwargs)
 299
 300
 301 def get_element_by_class(class_name, html):
 302     """Return the content of the first tag with the specified class in the passed HTML document"""
 303     retval = get_elements_by_class(class_name, html)
 304     return retval[0] if retval else None
 305
 306
 307 def get_element_html_by_class(class_name, html):
 308     """Return the html of the first tag with the specified class in the passed HTML document"""
 309     retval = get_elements_html_by_class(class_name, html)
 310     return retval[0] if retval else None
 311
 312
 313 def get_element_by_attribute(attribute, value, html, **kwargs):
 314     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 315     return retval[0] if retval else None
 316
 317
 318 def get_element_html_by_attribute(attribute, value, html, **kargs):
 319     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 320     return retval[0] if retval else None
 321
 322
 323 def get_elements_by_class(class_name, html, **kargs):
 324     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 325     return get_elements_by_attribute(
 326         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 327         html, escape_value=False)
 328
 329
 330 def get_elements_html_by_class(class_name, html):
 331     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 332     return get_elements_html_by_attribute(
 333         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 334         html, escape_value=False)
 335
 336
 337 def get_elements_by_attribute(*args, **kwargs):
 338     """Return the content of the tag with the specified attribute in the passed HTML document"""
 339     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 340
 341
 342 def get_elements_html_by_attribute(*args, **kwargs):
 343     """Return the html of the tag with the specified attribute in the passed HTML document"""
 344     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 345
 346
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator yielding one (content, whole) tuple per matching element.

    @param attribute     Name of the attribute to look for
    @param value         Value of the attribute; nothing is yielded if it is empty
    @param html          The HTML document to search
    @param tag           Regex that the tag name has to match
    @param escape_value  Whether value is a literal string (True) or a regex (False)
    """
    if not value:
        return

    # The quotes around the value are optional ('?') unless the value starts
    # with a character from this set, in which case they are required
    # NOTE(review): re.match only inspects the beginning of value
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches an opening tag up to and including the wanted attribute;
    # (?-x:...) turns verbose mode off so whitespace in value is significant
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Parse from the start of the match to find the end of the element
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of matching quotes enclosing the content, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 372
 373
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        """Raised when the closing tag of the first opened tag has been reached"""
        pass

    def __init__(self):
        # Stack of the names of the currently open tags, outermost first
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        """Record the opened tag; its attributes are ignored"""
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        """Pop open tags up to and including the one closed by tag

        Raises compat_HTMLParseError if no tag is open or none of the open
        tags matches, and HTMLBreakOnClosingTagException once the stack
        becomes empty, i.e. the first opened tag has been closed
        """
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Tags opened after the matching one are discarded together with it
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
 414
 415
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    Raises compat_HTMLParseError if the opening or closing tag cannot be found
    """
    def find_or_raise(haystack, needle, exc):
        # Like str.index, but raising exc if needle is not found
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Convert to an absolute offset pointing just past the '>' of the opening tag
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed the opening tag so that it becomes the bottom of the tag stack
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document up to each successive closing tag until the parser
        # signals that the one balancing the opening tag has been reached
        while offset < len(html):
            # These two positions are relative to offset
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
 450
 451
 452 class HTMLAttributeParser(html.parser.HTMLParser):
 453     """Trivial HTML parser to gather the attributes for a single element"""
 454
 455     def __init__(self):
 456         self.attrs = {}
 457         html.parser.HTMLParser.__init__(self)
 458
 459     def handle_starttag(self, tag, attrs):
 460         self.attrs = dict(attrs)
 461         raise compat_HTMLParseError('done')
 462
 463
 464 class HTMLListAttrsParser(html.parser.HTMLParser):
 465     """HTML parser to gather the attributes for the elements of a list"""
 466
 467     def __init__(self):
 468         html.parser.HTMLParser.__init__(self)
 469         self.items = []
 470         self._level = 0
 471
 472     def handle_starttag(self, tag, attrs):
 473         if tag == 'li' and self._level == 0:
 474             self.items.append(dict(attrs))
 475         self._level += 1
 476
 477     def handle_endtag(self, tag):
 478         self._level -= 1
 479
 480
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    # The parser raises compat_HTMLParseError once it has seen the first start tag
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
 500
 501
 502 def parse_list(webpage):
 503     """Given a string for an series of HTML <li> elements,
 504     return a dictionary of their attributes"""
 505     parser = HTMLListAttrsParser()
 506     parser.feed(webpage)
 507     parser.close()
 508     return parser.items
 509
 510
 511 def clean_html(html):
 512     """Clean an HTML snippet into a readable string"""
 513
 514     if html is None:  # Convenience for sanitizing descriptions etc.
 515         return html
 516
 517     html = re.sub(r'\s+', ' ', html)
 518     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 519     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 520     # Strip html tags
 521     html = re.sub('<.*?>', '', html)
 522     # Replace html entities
 523     html = unescapeHTML(html)
 524     return html.strip()
 525
 526
 527 class LenientJSONDecoder(json.JSONDecoder):
 528     # TODO: Write tests
 529     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 530         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 531         self._close_attempts = 2 * close_objects
 532         super().__init__(*args, **kwargs)
 533
 534     @staticmethod
 535     def _close_object(err):
 536         doc = err.doc[:err.pos]
 537         # We need to add comma first to get the correct error message
 538         if err.msg.startswith('Expecting \',\''):
 539             return doc + ','
 540         elif not doc.endswith(','):
 541             return
 542
 543         if err.msg.startswith('Expecting property name'):
 544             return doc[:-1] + '}'
 545         elif err.msg.startswith('Expecting value'):
 546             return doc[:-1] + ']'
 547
 548     def decode(self, s):
 549         if self.transform_source:
 550             s = self.transform_source(s)
 551         for attempt in range(self._close_attempts + 1):
 552             try:
 553                 if self.ignore_extra:
 554                     return self.raw_decode(s.lstrip())[0]
 555                 return super().decode(s)
 556             except json.JSONDecodeError as e:
 557                 if e.pos is None:
 558                     raise
 559                 elif attempt < self._close_attempts:
 560                     s = self._close_object(e)
 561                     if s is not None:
 562                         continue
 563                 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
 564         assert False, 'Too many attempts to decode JSON'
 565
 566
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # '-' stands for the standard output
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # First attempt uses the filename as given, the second one its sanitized form
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    # NOTE(review): presumably an OSError subclass, so that the
                    # handler below falls back to a plain open() - verify
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed; open the file without a lock
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt, or if access was denied
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Sanitizing changed nothing, so retrying would fail the same way
            if old_filename == filename:
                raise
 603                 raise
 604
 605
 606 def timeconvert(timestr):
 607     """Convert RFC 2822 defined time string into system timestamp"""
 608     timestamp = None
 609     timetuple = email.utils.parsedate_tz(timestr)
 610     if timetuple is not None:
 611         timestamp = email.utils.mktime_tz(timetuple)
 612     return timestamp
 613
 614
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Substitute characters are prefixed with '\0' so that they can be told
        # apart from characters of the input when cleaning up the result below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Drop question marks and control characters
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers; fall back to '_' if nothing is left
    result = result.replace('\0', '') or '_'

    # Additional cleanup that is skipped for IDs
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        # Replace a leading dash with an underscore
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 668
 669
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    @param s      The path to sanitize
    @param force  Sanitize the path on other platforms as well; without it
                  the path is returned unchanged there
    """
    if sys.platform == 'win32':
        # force only affects the handling of a leading separator on other platforms
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    # Split the normalized path, without its drive/UNC prefix, into components
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # NOTE(review): presumably drops the empty component left by the
        # separator following the drive - confirm
        norm_path.pop(0)
    # Replace the characters /<>:"|\?* as well as a trailing dot or whitespace
    # with '#' in each component, leaving '.' and '..' alone
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
 691
 692
 693 def sanitize_url(url, *, scheme='http'):
 694     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 695     # the number of unwanted failures due to missing protocol
 696     if url is None:
 697         return
 698     elif url.startswith('//'):
 699         return f'{scheme}:{url}'
 700     # Fix some common typos seen so far
 701     COMMON_TYPOS = (
 702         # https://github.com/ytdl-org/youtube-dl/issues/15649
 703         (r'^httpss://', r'https://'),
 704         # https://bx1.be/lives/direct-tv/
 705         (r'^rmtp([es]?)://', r'rtmp\1://'),
 706     )
 707     for mistake, fixup in COMMON_TYPOS:
 708         if re.match(mistake, url):
 709             return re.sub(mistake, fixup, url)
 710     return url
 711
 712
 713 def extract_basic_auth(url):
 714     parts = urllib.parse.urlsplit(url)
 715     if parts.username is None:
 716         return url, None
 717     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 718         parts.hostname if parts.port is None
 719         else '%s:%d' % (parts.hostname, parts.port))))
 720     auth_payload = base64.b64encode(
 721         ('%s:%s' % (parts.username, parts.password or '')).encode())
 722     return url, f'Basic {auth_payload.decode()}'
 723
 724
 725 def expand_path(s):
 726     """Expand shell variables and ~"""
 727     return os.path.expandvars(compat_expanduser(s))
 728
 729
 730 def orderedSet(iterable, *, lazy=False):
 731     """Remove all duplicates from the input iterable"""
 732     def _iter():
 733         seen = []  # Do not use set since the items can be unhashable
 734         for x in iterable:
 735             if x not in seen:
 736                 seen.append(x)
 737                 yield x
 738
 739     return _iter() if lazy else list(_iter())
 740
 741
 742 def _htmlentity_transform(entity_with_semicolon):
 743     """Transforms an HTML entity to a character."""
 744     entity = entity_with_semicolon[:-1]
 745
 746     # Known non-numeric HTML entity
 747     if entity in html.entities.name2codepoint:
 748         return chr(html.entities.name2codepoint[entity])
 749
 750     # TODO: HTML5 allows entities without a semicolon.
 751     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 752     if entity_with_semicolon in html.entities.html5:
 753         return html.entities.html5[entity_with_semicolon]
 754
 755     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 756     if mobj is not None:
 757         numstr = mobj.group(1)
 758         if numstr.startswith('x'):
 759             base = 16
 760             numstr = '0%s' % numstr
 761         else:
 762             base = 10
 763         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 764         with contextlib.suppress(ValueError):
 765             return chr(int(numstr, base))
 766
 767     # Unknown entity in name, return its literal representation
 768     return '&%s;' % entity
 769
 770
 771 def unescapeHTML(s):
 772     if s is None:
 773         return None
 774     assert isinstance(s, str)
 775
 776     return re.sub(
 777         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 778
 779
 780 def escapeHTML(text):
 781     return (
 782         text
 783         .replace('&', '&amp;')
 784         .replace('<', '&lt;')
 785         .replace('>', '&gt;')
 786         .replace('"', '&quot;')
 787         .replace("'", '&#39;')
 788     )
 789
 790
 791 class netrc_from_content(netrc.netrc):
 792     def __init__(self, content):
 793         self.hosts, self.macros = {}, {}
 794         with io.StringIO(content) as stream:
 795             self._parse('-', stream, False)
 796
 797
class Popen(subprocess.Popen):
    """subprocess.Popen with a few conveniences

    - The environment set up by PyInstaller is not leaked to the child process
    - `text=True` defaults to UTF-8 with invalid bytes being replaced
    - The `startupinfo` defined on the class is always passed to subprocess.Popen
    """
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573

        env is modified in place
        """
        # sys._MEIPASS is only checked for existence; nothing to do without it
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            # Restore the value saved as `<key>_ORIG`, or unset the variable
            # if no such value exists
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        # Work on a copy so that os.environ itself is never modified
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Truthy if the streams are opened in text mode; read by run()
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process and wait for it to exit on any exception"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process and, unless timeout is 0, wait for it to exit

        timeout is passed on to wait(), so None waits indefinitely
        """
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run the process to completion and return the tuple (stdout, stderr, returncode)

        Missing or empty output is returned as an empty str in text mode, else as empty bytes
        """
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
 854
 855
 856 def encodeArgument(s):
 857     # Legacy code that uses byte strings
 858     # Uncomment the following line after fixing all post processors
 859     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 860     return s if isinstance(s, str) else s.decode('ascii')
 861
 862
 863 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 864
 865
 866 def timetuple_from_msec(msec):
 867     secs, msec = divmod(msec, 1000)
 868     mins, secs = divmod(secs, 60)
 869     hrs, mins = divmod(mins, 60)
 870     return _timetuple(hrs, mins, secs, msec)
 871
 872
 873 def formatSeconds(secs, delim=':', msec=False):
 874     time = timetuple_from_msec(secs * 1000)
 875     if time.hours:
 876         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 877     elif time.minutes:
 878         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 879     else:
 880         ret = '%d' % time.seconds
 881     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 882
 883
 884 def bug_reports_message(before=';'):
 885     from ..update import REPOSITORY
 886
 887     msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
 888            'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')
 889
 890     before = before.rstrip()
 891     if not before or before.endswith(('.', '!', '?')):
 892         msg = msg[0].title() + msg[1:]
 893
 894     return (before + ' ' if before else '') + msg
 895
 896
 897 class YoutubeDLError(Exception):
 898     """Base exception for YoutubeDL errors."""
 899     msg = None
 900
 901     def __init__(self, msg=None):
 902         if msg is not None:
 903             self.msg = msg
 904         elif self.msg is None:
 905             self.msg = type(self).__name__
 906         super().__init__(self.msg)
 907
 908
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.

        cause, video_id and ie are stored as given and contribute to the final
        message (see the `__msg` property).
        """
        from ..networking.exceptions import network_exceptions
        # An error created while one of the network exceptions is being handled is always "expected"
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        # NB: `msg` is still the falsy class default here, so __setattr__ does not
        # rebuild the message for these assignments; it is first built by super().__init__ below
        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When created while handling another ExtractorError, reuse that error's exc_info
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: ie prefix, video_id prefix, original message, cause suffix,
        # plus the bug-report hint when the error is not expected
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted `traceback` and `cause` (whichever are set) as text; None if that is empty."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        # Keep `msg` and `args` in sync: once a message exists, changing any other
        # attribute (e.g. video_id, expected) regenerates both
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
 951
 952
 953 class UnsupportedError(ExtractorError):
 954     def __init__(self, url):
 955         super().__init__(
 956             'Unsupported URL: %s' % url, expected=True)
 957         self.url = url
 958
 959
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match

    Adds no behavior of its own; it only gives this failure a distinct type.
    """
    pass
 963
 964
 965 class GeoRestrictedError(ExtractorError):
 966     """Geographic restriction Error exception.
 967
 968     This exception may be thrown when a video is not available from your
 969     geographic location due to geographic restrictions imposed by a website.
 970     """
 971
 972     def __init__(self, msg, countries=None, **kwargs):
 973         kwargs['expected'] = True
 974         super().__init__(msg, **kwargs)
 975         self.countries = countries
 976
 977
 978 class UserNotLive(ExtractorError):
 979     """Error when a channel/user is not live"""
 980
 981     def __init__(self, msg=None, **kwargs):
 982         kwargs['expected'] = True
 983         super().__init__(msg or 'The channel is not currently live', **kwargs)
 984
 985
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        # Stored as-is (None when not supplied)
        self.exc_info = exc_info
 998
 999
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Default message, used by YoutubeDLError.__init__ when none is passed
    msg = 'Entry not found in info'
1007
1008
class SameFileError(YoutubeDLError):
    """Same File exception.

    Raised by FileDownloader objects when several downloads would end up
    being written to one and the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """`filename`, if given, is appended to the default message."""
        if filename is not None:
            # Assigns an instance attribute; the class-level default is left untouched
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
1021
1022
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.

    Adds nothing to YoutubeDLError besides the distinct type.
    """
1029
1030
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Default message, used by YoutubeDLError.__init__ when none is passed
    msg = 'The download was cancelled'
1034
1035
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Overrides the default message inherited from DownloadCancelled
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1039
1040
class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    # Overrides the default message inherited from DownloadCancelled
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1044
1045
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Overrides the default message inherited from DownloadCancelled
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1049
1050
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """`expected` is stored as given on the instance."""
        super().__init__(msg)
        self.expected = expected
1057
1058
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Takes no arguments: always uses the class-level message with expected=False
        super().__init__(self.msg, expected=False)
1065
1066
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Raised when a video is requested in a format
    that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """`err`, if given, is appended to the default message."""
        if err is not None:
            # Assigns an instance attribute; the class-level default is left untouched
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1079
1080
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    May be raised by FileDownloader objects when a downloaded file is
    smaller than what the server announced first, which indicates that
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
1094
1095
class XAttrMetadataError(YoutubeDLError):
    """Extended-attribute error that classifies itself into a `reason`.

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED',
    derived from the error code and the message text.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        out_of_space = (
            self.code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in self.msg
            or 'Disk quota exceeded' in self.msg)
        if out_of_space:
            self.reason = 'NO_SPACE'
            return
        value_too_long = self.code == errno.E2BIG or 'Argument list too long' in self.msg
        self.reason = 'VALUE_TOO_LONG' if value_too_long else 'NOT_SUPPORTED'
1110
1111
class XAttrUnavailableError(YoutubeDLError):
    # NOTE(review): presumably raised when extended-attribute support is unavailable;
    # the raising code is outside this section - confirm against callers
    pass
1114
1115
def is_path_like(f):
    """Return True if `f` is a str, bytes or `os.PathLike` object."""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1118
1119
def extract_timezone(date_str):
    """Split a trailing timezone off `date_str`.

    Returns a `(timezone, date_str)` tuple: the UTC offset as a timedelta (zero
    if none was recognised) and the date string with the recognised timezone removed.
    Recognises "Z", numeric "+hh[:]mm"/"-hh[:]mm" offsets and names in TIMEZONE_NAMES.
    """
    offset_match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)

    if offset_match:
        remainder = date_str[:-len(offset_match.group('tz'))]
        sign_char = offset_match.group('sign')
        if not sign_char:
            # Matched the bare "Z": UTC
            return datetime.timedelta(), remainder
        direction = 1 if sign_char == '+' else -1
        offset = datetime.timedelta(
            hours=direction * int(offset_match.group('hours')),
            minutes=direction * int(offset_match.group('minutes')))
        return offset, remainder

    # No numeric offset: look for an upper-case timezone name directly after a time
    name_match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    tz_hours = TIMEZONE_NAMES.get(name_match and name_match.group('tz').strip())
    if tz_hours is not None:
        date_str = date_str[:-len(name_match.group('tz'))]
    return datetime.timedelta(hours=tz_hours or 0), date_str
1148
1149
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param delimiter  Separator between the date and the time part
    @param timezone   UTC offset as a timedelta; extracted from date_str when None
    Returns None if date_str is None or does not match the expected format.
    """
    if date_str is None:
        return None

    # Fractional seconds are discarded
    stripped = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, stripped = extract_timezone(stripped)

    try:
        parsed = datetime.datetime.strptime(stripped, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((parsed - timezone).timetuple())
    except ValueError:
        return None
1165
1166
def date_formats(day_first=True):
    """Return the day-first format list if `day_first` is truthy, else the month-first one."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1169
1170
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    upload_date = None
    # NB: every format is tried; the last one that parses determines the result
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(cleaned, expression).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to RFC 2822 style parsing
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if upload_date is None else str(upload_date)
1193
1194
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string

    @param day_first  Selects which list of formats from date_formats() is tried
    Returns None if date_str is not a str or cannot be parsed. The result is an
    int when one of the known formats matches; the email-style fallback
    may return a float since it subtracts `timezone.total_seconds()`.
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas, pipes and weekday names (abbreviated or full, any case), then collapse whitespace.
    # All seven weekdays must be listed: a leftover day name makes every format below fail
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str))

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # The first format that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to RFC 2822 style parsing
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1226
1227
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from `url`, returning `default_ext` if none can be determined.

    The query string (everything from the first '?') is ignored.
    """
    if url is None or '.' not in url:
        return default_ext

    path_part, _, _ = url.partition('?')
    candidate = path_part.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1239
1240
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by "<sub_lang>.<sub_format>" (via replace_extension)."""
    sub_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, sub_ext, expected_real_ext)
1243
1244
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Build a datetime object out of a string.
    Accepted syntax:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # Plain DATE without an offset
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # Everything before the last offset is resolved recursively
    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount

    unit = match.group('unit')
    if unit in ('month', 'year'):
        # timedelta has no month/year units; these are applied calendar-wise and rounded per day
        new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(new_date, unit) if auto_precision else new_date
1285
1286
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Build a date object out of a string by means of datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    Raises ValueError if strict is set and date_str does not match.
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1297
1298
def datetime_add_months(dt, months):
    """Shift a datetime object by `months` (may be negative), clamping the day to the target month's length."""
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    year, month = dt.year + year_shift, month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1306
1307
def datetime_round(dt, precision='day'):
    """
    Round the time of a datetime object to the given precision
    (microsecond|second|minute|hour|day); halves are rounded up.
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    timestamp = calendar.timegm(dt.timetuple())
    step = unit_seconds[precision]
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1324
1325
def hyphenate_date(date_str):
    """
    Turn a 'YYYYMMDD' date into 'YYYY-MM-DD'; any other string is returned unchanged"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if match is None else '-'.join(match.groups())
1334
1335
class DateRange:
    """An inclusive interval of time delimited by two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str (strict mode).

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError if start lies after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start, strict=True)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Build a range that covers exactly the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Tell whether the date (a date object or a string) lies within the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
1369
1370
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime, platform, OpenSSL and (if known) libc."""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_ver = []

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1389
1390
@functools.cache
def get_windows_version():
    ''' Return the Windows version as a tuple; () when not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1398
1399
def write_string(s, out=None, encoding=None):
    """Write the str `s` to `out` (default: sys.stderr) and flush it.

    Binary-mode streams, and streams exposing a `buffer`, receive `s` encoded
    (unencodable characters are dropped); other streams receive `s` as text.
    Does nothing if no output stream is available.
    """
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(out, 'mode', ''):
        target, enc = out, encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        target, enc = out.buffer, encoding or getattr(out, 'encoding', None) or preferredencoding()
    else:
        target, enc = out, None

    payload = s.encode(enc, 'ignore') if enc else s
    target.write(payload)
    out.flush()
1419
1420
# TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report a deprecation.

    In CLI mode each distinct `msg` is emitted only once, through `printer` if
    given and else through write_string; `kwargs` are forwarded to whichever is used.
    Otherwise a DeprecationWarning is issued via the `warnings` module.
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    seen = deprecation_warning._cache
    if msg in seen:
        return
    seen.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Messages that have already been reported in CLI mode
deprecation_warning._cache = set()
1437
1438
def bytes_to_intlist(bs):
    """Convert a byte string (or any sequence of ints or characters) to a list of ints; [] if empty."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        return list(map(ord, bs))
    return list(bs)  # Python 3
1446
1447
def intlist_to_bytes(xs):
    """Pack a sequence of ints (each 0-255) into a byte string; b'' if empty."""
    if xs:
        return struct.pack(f'{len(xs)}B', *xs)
    return b''
1452
1453
class LockingUnsupportedError(OSError):
    """OSError raised by the fallback _lock_file/_unlock_file when no locking backend could be imported."""
    msg = 'File locking is not supported'

    def __init__(self):
        # Takes no arguments: the message is always the class-level one
        super().__init__(self.msg)
1459
1460
# Cross-platform file locking
# Defines `_lock_file(f, exclusive, block)` and `_unlock_file(f)` for the current platform:
# LockFileEx/UnlockFileEx on Windows, fcntl elsewhere, and stubs raising
# LockingUnsupportedError when fcntl cannot be imported
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock `f` via LockFileEx; raises BlockingIOError if the call fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError if the call fails."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock `f` with flock(), falling back to lockf(); a BlockingIOError from flock() propagates."""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Unlock `f`, trying each unlock variant in turn; only the last attempt may raise."""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
1547
1548
class locked_file:
    """File wrapper that locks the file on entering the context and unlocks and closes it on exit.

    Only the modes r, rb, a, ab, w and wb are accepted. Attributes that are
    not defined here are looked up on the underlying file object.
    """
    # Whether the lock is currently held by this object
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """Open `filename` without locking it yet.

        @param block  Passed on to _lock_file when the lock is taken in __enter__
        Raises NotImplementedError for any unsupported mode.
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        # OR together the os.open() flags; flags missing on this platform contribute 0
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        """Take the lock (shared for read modes, exclusive otherwise), then truncate in 'w' modes."""
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            # Do not leave the file open if the lock could not be taken
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncation happens only now so that the contents are not wiped before the lock is held
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; `locked` is reset even if unlocking fails."""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        # The file is closed even if unlocking raises
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the object can also be used without a `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Only called for attributes not found on this object: delegate to the file
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1612
1613
@functools.cache
def get_filesystem_encoding():
    """Return `sys.getfilesystemencoding()`, or 'utf-8' if that is None (result is cached)."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1618
1619
def shell_quote(args):
    """Quote every argument for the shell and join them with spaces.

    Byte-string arguments are first decoded with the filesystem encoding.
    """
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(a.decode(encoding) if isinstance(a, bytes) else a)
        for a in args)
1629
1630
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    The data is JSON-encoded into the URL fragment. If `url` already carries
    smuggled data, both are merged, with the already-smuggled values taking precedence.
    `data` itself is not modified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a copy so that the caller's dict is left untouched
    merged = dict(data)
    merged.update(idata)
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: return `(url, data)`.

    When the URL carries no smuggled data, it is returned unchanged together with `default`.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
1648
1649
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like k, M, etc

    @param fmt     %-format receiving the scaled number and the suffix
    @param factor  Base of the scale; with 1024 the binary suffixes (Ki, Mi, ...) are used
    Returns None if num is not a number (as per float_or_none) or is negative.
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to the available suffixes. The lower bound matters for 0 < num < 1/factor:
        # a negative exponent would index the suffix list from its end (e.g. 0.0005 -> "0Y")
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
1662
1663
def format_bytes(bytes):
    """Format a byte count with two decimals and a binary suffix (e.g. KiB); 'N/A' if it cannot be formatted."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
1666
1667
def lookup_unit_table(unit_table, s, strict=False):
    """Parse "<number><unit>" from `s` and return the number scaled by the unit's multiplier.

    @param unit_table  Mapping of unit name to multiplier
    @param strict      If set, the whole string must match and only '.' is accepted as
                       decimal separator; otherwise a prefix match suffices and ',' is accepted too
    Returns the rounded result, or None if `s` does not match.
    """
    if strict:
        num_re, matcher = NUMBER_RE, re.fullmatch
    else:
        num_re, matcher = NUMBER_RE.replace(R'\.', '[,.]'), re.match
    units_re = '|'.join(map(re.escape, unit_table))
    m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None

    quantity = float(m.group('num').replace(',', '.'))
    return round(quantity * unit_table[m.group('unit')])
1679
1680
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer

    Accepts a number optionally followed by one of the suffixes K, M, G, T, P, E, Z, Y
    (case-insensitive), each a power of 1024.
    """
    multipliers = {}
    for power, unit in enumerate(['', *'KMGTPEZY']):
        multipliers[unit] = 1024**power
    return lookup_unit_table(multipliers, s.upper(), strict=True)
1686
1687
def parse_filesize(s):
    """Parse a file size such as "5 MiB" or "1,5 GB" into a number of bytes.

    Returns None if `s` is None or if lookup_unit_table finds no match.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # NB: the table's insertion order determines the order of the alternatives
    # in the regex built by lookup_unit_table
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    # Non-strict lookup: a prefix match suffices and ',' is accepted as decimal separator
    return lookup_unit_table(_UNIT_TABLE, s)
1755
1756     return lookup_unit_table(_UNIT_TABLE, s)
1757
1758
def parse_count(s):
    """Parse a count such as "1,234", "1.2M" or "views 10k" into an integer

    Returns None for None input or when no number can be extracted.
    """
    if s is None:
        return None

    # Drop a leading non-numeric label that is followed by whitespace
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    # Each suffix is accepted in lower and upper case
    multipliers = {}
    for suffix, multiplier in (('k', 1000), ('m', 1000 ** 2), ('kk', 1000 ** 2), ('b', 1000 ** 3)):
        multipliers[suffix] = multipliers[suffix.upper()] = multiplier

    scaled = lookup_unit_table(multipliers, s)
    if scaled is not None:
        return scaled

    # Fall back to a leading plain number followed by whitespace or the end
    leading = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(leading.group(1)) if leading else None
1786
1787
def parse_resolution(s, *, lenient=False):
    """Extract 'width' and/or 'height' from a string such as "1920x1080", "720p" or "4K"

    Unless lenient, a WxH pair must not be directly adjacent to letters or digits.
    Returns an empty dict when s is None or nothing is recognised.
    """
    if s is None:
        return {}

    dimensions_re = (
        r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
        else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    dimensions = re.search(dimensions_re, s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    # "<height>p" / "<height>i"
    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    # "4k" / "8k": the digit times 540 gives the height
    kilo = re.search(r'\b([48])[kK]\b', s)
    return {'height': int(kilo.group(1)) * 540} if kilo else {}
1811
1812
def parse_bitrate(s):
    """Return the integer preceding "kbps" in s; None if absent or s is not a str"""
    mobj = re.search(r'\b(\d+)\s*kbps', s) if isinstance(s, str) else None
    return int(mobj.group(1)) if mobj else None
1819
1820
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of a month given its full name

    Names are looked up in MONTH_NAMES[lang], falling back to the English
    names for unknown languages. Returns None if the name is not listed. """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in names:
        return None
    return names.index(name) + 1
1830
1831
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month given the first three letters
        of its English name, or None if there is no such month """
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1840
1841
def fix_xml_ampersands(xml_str):
    """Escape every '&' in XML as '&amp;' unless it already starts an entity

    The five named XML entities and numeric references of up to 4 digits
    are left untouched."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1848
1849
def setproctitle(title):
    """Best-effort attempt to set the process name through libc's prctl()

    Silently does nothing when ctypes cannot be imported, 'libc.so.6' cannot
    be loaded, or the loaded libc does not provide prctl.

    @param title  the new process name (str)
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initialising the buffer from the bytes themselves makes ctypes allocate
    # one extra byte, so the name is always NUL-terminated. A buffer sized
    # exactly len(title_bytes) has no terminator, and prctl would read past
    # its end for short titles
    buf = ctypes.create_string_buffer(title.encode())
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1875
1876
def remove_start(s, start):
    """Return s without the leading `start`, if present; None is passed through"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1879
1880
def remove_end(s, end):
    """Return s without the trailing `end`, if present; None is passed through

    An empty `end` leaves s unchanged.
    """
    # `not end` guards against s[:-0], which would evaluate to an empty string
    if s is None or not s.endswith(end) or not end:
        return s
    return s[:-len(end)]
1883
1884
def remove_quotes(s):
    """Strip one pair of matching single or double quotes enclosing s, if any"""
    if s is None or len(s) < 2:
        return s
    # Both ends must be the same kind of quote
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1892
1893
def get_domain(url):
    """
    Return the network location of url without a leading "www.", or None if empty.

    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1900
1901
def url_basename(url):
    """Return the last segment of the URL's path, ignoring slashes at either end of the path"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rsplit('/', 1)[-1]
1905
1906
def base_url(url):
    """Return the http(s) URL up to and including the last "/" before any query or fragment

    Raises AttributeError if url does not match that shape.
    """
    mobj = re.match(r'https?://[^?#]+/', url)
    return mobj.group()
1909
1910
def urljoin(base, path):
    """Join path onto base, tolerating bytes and invalid input

    Returns path itself when it already starts with "//" (optionally preceded
    by a scheme), and None when path is empty or not a string, or when base is
    not an http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not (isinstance(path, str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if isinstance(base, str) and re.match(r'^(?:https?:)?//', base):
        return urllib.parse.urljoin(base, path)
    return None
1924
1925
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, multiply by invscale and floor-divide by scale

    If get_attr is given, the conversion is applied to that attribute of v.
    Returns default when the value cannot be converted.
    """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    try:
        scaled = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
    return scaled
1933
1934
def str_or_none(v, default=None):
    """Return str(v), or default when v is None"""
    if v is None:
        return default
    return str(v)
1937
1938
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    ints are returned as is; strings have ",", "." and "+" removed before
    conversion; anything else gives None """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    return int_or_none(re.sub(r'[,\.\+]', '', int_str))
1946
1947
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, multiply by invscale and divide by scale

    Returns default when v is None or cannot be converted.
    """
    try:
        return default if v is None else float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
1955
1956
def bool_or_none(v, default=None):
    """Return v if it is a real bool, otherwise default"""
    if isinstance(v, bool):
        return v
    return default
1959
1960
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace if it is a str, otherwise default"""
    if isinstance(v, str):
        return v.strip()
    return default
1963
1964
def url_or_none(url):
    """Return the stripped url if it looks like a supported URL, otherwise None

    Accepted are protocol-relative URLs ("//...") and the http(s), rtmp/rtsp
    variants, mms and ftp(s) schemes matched by the pattern below.
    """
    if not (url and isinstance(url, str)):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
1970
1971
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a unix timestamp (int/float) or a YYYYMMDD string with date_format

    "%s" in date_format is replaced by the unix timestamp of the parsed date.
    Returns default if the input is of another type or cannot be parsed/formatted.
    """
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            epoch = datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
            moment = epoch + datetime.timedelta(seconds=timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            moment = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            # Unsupported type: the attribute access below fails and yields `default`
            moment = None
        unix_seconds = int(moment.timestamp())
        # Support %s on windows
        date_format = re.sub(r'(?<!%)(%%)*%s', rf'\g<1>{unix_seconds}', date_format)
        return moment.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
1989
1990
1991 def parse_duration(s):
1992     if not isinstance(s, str):
1993         return None
1994     s = s.strip()
1995     if not s:
1996         return None
1997
1998     days, hours, mins, secs, ms = [None] * 5
1999     m = re.match(r'''(?x)
2000             (?P<before_secs>
2001                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2002             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2003             (?P<ms>[.:][0-9]+)?Z?$
2004         ''', s)
2005     if m:
2006         days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2007     else:
2008         m = re.match(
2009             r'''(?ix)(?:P?
2010                 (?:
2011                     [0-9]+\s*y(?:ears?)?,?\s*
2012                 )?
2013                 (?:
2014                     [0-9]+\s*m(?:onths?)?,?\s*
2015                 )?
2016                 (?:
2017                     [0-9]+\s*w(?:eeks?)?,?\s*
2018                 )?
2019                 (?:
2020                     (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2021                 )?
2022                 T)?
2023                 (?:
2024                     (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
2025                 )?
2026                 (?:
2027                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2028                 )?
2029                 (?:
2030                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2031                 )?Z?$''', s)
2032         if m:
2033             days, hours, mins, secs, ms = m.groups()
2034         else:
2035             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2036             if m:
2037                 hours, mins = m.groups()
2038             else:
2039                 return None
2040
2041     if ms:
2042         ms = ms.replace(':', '.')
2043     return sum(float(part or 0) * mult for part, mult in (
2044         (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2045
2046
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename

    If expected_real_ext is given and the real extension differs from it,
    ext is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
2053
2054
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of filename with ext

    If expected_real_ext is given and the real extension differs from it,
    ext is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return f'{stem}.{ext}'
2060
2061
def check_executable(exe, args=[]):
    """ Checks if the given binary can be run, and returns its name (False otherwise).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
2070
2071
def _get_exe_version_output(exe, args):
    """Run exe with args and return its combined stdout/stderr as text

    Returns False if the executable could not be started (OSError) and
    None if it exited with a non-zero return code.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, ret = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if ret else stdout
2084
2085
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output

    version_re must have the version in its first group; the default matches
    "version <something>". Returns `unrecognized` when nothing matches.
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    mobj = re.search(pattern, output)
    return mobj.group(1) if mobj else unrecognized
2095
2096
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    unrecognized holds one or two fallbacks: the first is returned when the
    output contains no version, the last when the executable exits with an error """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    out = _get_exe_version_output(exe, args)
    if out is None:
        return unrecognized[-1]
    if not out:
        # False (could not be run) or empty output is handed back unchanged
        return out
    return detect_exe_version(out, version_re, unrecognized[0])
2107
2108
def frange(start=0, stop=None, step=1):
    """Float range

    Like range(), but the bounds and step may be floats. With a single
    argument it counts from 0 up to that value. A zero step yields nothing."""
    if stop is None:
        start, stop = 0, start
    if not step:
        direction = 0
    elif step > 0:
        direction = 1
    else:
        direction = -1
    current = start
    # Multiplying both sides by the direction turns "not yet past stop" into one comparison
    while direction * current < direction * stop:
        yield current
        current += step
2117
2118
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the iterable only when needed and are kept in an
    internal cache, so the list can be indexed and iterated repeatedly."""

    class IndexError(IndexError):
        # Raised by __getitem__ in place of the builtin IndexError (which it subclasses)
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        """
        @param iterable  source of the items
        @param reverse   present the items in reverse order
        @param _cache    list of already evaluated items; used internally to share
                         the cache between copies
        """
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # First replay what has already been evaluated, then continue pulling
        # from the iterable, caching each new item as it is produced
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        """Evaluate the entire iterable and return the cache (always in forward order)"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        # Returns a new list of all items, in reverse order if the list is reversed
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # ~x == -x - 1: maps an index counted from one end to the
        # same distance counted from the other end. None is kept as is
        return None if x is None else ~x

    def __getitem__(self, idx):
        """Return the item at an integer index, or a list for a slice

        Raises TypeError for other index types and LazyList.IndexError
        when the index is out of range."""
        if isinstance(idx, slice):
            if self._reversed:
                # Translate the slice into the equivalent slice of the forward-order cache
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            # step == 0 marks a plain index, so neither open-ended check below applies
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of items still missing from the cache to reach the highest index involved
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Only the first item in iteration order is looked up. For a reversed
        # list that is the last item of the iterable, which exhausts it
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # The length is only known once everything has been evaluated
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # The new object shares this one's cache list
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        # The new object shares this one's cache list
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2206
2207
class PagedList:
    """Base class for lists whose items are fetched one page at a time

    pagefunc(pagenum) must return an iterable with the items of that page.
    Subclasses implement _getslice() to define how pages are walked."""

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc, self._pagesize = pagefunc, pagesize
        # Highest valid page number; unknown until a subclass determines it
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the items of the given page as a list, using the cache when possible"""
        results = self._cache.get(pagenum)
        if results is None:
            if pagenum > self._pagecount:
                results = []
            else:
                results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list"""
        return [*self._getslice(start, end)]

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not (isinstance(idx, int) and idx >= 0):
            raise TypeError('indices must be non-negative integers')
        found = self.getslice(idx, idx + 1)
        if found:
            return found[0]
        raise self.IndexError()
2246
2247
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        """Yield the items in [start, end), fetching pages one after another

        Iteration stops at the first page that is not full, or once the page
        containing `end` has been processed."""
        for pagenum in itertools.count(start // self._pagesize):
            # Indices covered by this page: firstid <= index < nextfirstid
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset of the first wanted item within this page
            # (non-zero only on the page that contains `start`)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset just past the last wanted item within this page,
            # or None if the page is wanted up to its end
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Record the previous page as the last valid one, so getpage()
                # returns an empty list for this and later pages from now on
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2287
2288
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        # Caching is always enabled for this variant
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        """Yield the items in [start, end), never requesting pages beyond the known page count"""
        first_page = start // self._pagesize
        if end is None:
            last_page = self._pagecount
            remaining = None
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start
        # Items to drop from the front of the first page
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, last_page):
            page = self.getpage(pagenum)
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested range
                    yield from page[:remaining]
                    break
                remaining -= len(page)
            yield from page
2313
2314
class PlaylistEntries:
    """Indexable view over the 'entries' of a playlist info_dict

    Indexing with an integer or a slice yields (1-based index, entry) pairs;
    see __getitem__ for the exact semantics."""

    # Placeholder for positions of an incomplete playlist that have no entry
    MissingEntry = object()
    # Becomes True once the end of the entries is known to have been reached
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        object providing `params`, `report_warning`, `_match_entry`
                          and (via its type) `_handle_extraction_exceptions`
        @param info_dict  mapping with 'entries' and optionally 'requested_entries'
        Raises EntryNotInPlaylist if info_dict has no entries.
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Rebuild a list as long as the highest requested (1-based) index and
            # put every available entry at its position; the gaps stay MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            # Any other iterable is wrapped so that it can be indexed lazily
            self._entries = LazyList(entries)

    # One comma-separated item: either a single index "START", or a range
    # "[START]:[END][:STEP]" ("-" is also accepted in place of the first ":")
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int (single index) or a slice (range) for each comma-separated item

        Raises ValueError for empty items, items that do not match
        PLAYLIST_ITEMS_RE, and ranges with a step of zero."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # `end` goes through float_or_none, so "inf" becomes float('inf')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (1-based index, entry) for the entries selected by the ydl params

        The selection comes from 'playlist_items' if set, otherwise from the
        range 'playliststart' to 'playlistend'."""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    # Stop producing any further items
                    return

    def get_full_count(self):
        """Return the total number of entries if it can be determined, otherwise None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page, the page count is the entry count
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function mapping a 0-based index to its entry

        The returned function raises PlaylistEntries.IndexError when the index
        is past the end of the entries."""
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                # In an incomplete playlist, an out of range index is reported
                # the same way as a gap
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # The lookup may trigger extraction, so it is run through
                    # ydl's _handle_extraction_exceptions wrapper
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Yield (1-based index, entry) pairs for an int or slice of 1-based indices

        Unlike regular slices, the stop index is inclusive. Negative indices
        count from the end, which requires len(self)."""
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move the stop one position further in the direction of travel,
        # so that the stop index itself is included
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forwards, everything after this is out of range as well.
                # Going backwards, lower indices may still exist
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts the entries by iterating over all of them
        return len(tuple(self[:]))

    class IndexError(IndexError):
        # Raised by _getter for indices past the end of the entries
        pass
2449
2450
def uppercase_escape(s):
    r"""Replace every literal \UXXXXXXXX escape in s with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2457
2458
def lowercase_escape(s):
    r"""Replace every literal \uXXXX escape in s with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2465
2466
def parse_qs(url, **kwargs):
    """Parse the query string of url into a dict of lists

    Extra keyword arguments are passed on to urllib.parse.parse_qs."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2469
2470
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object, one per line, and close it

    Lines may be str or bytes. A leading BOM and leading whitespace are
    removed; blank lines and lines starting with "#", ";" or "]" are skipped;
    a comment introduced by whitespace followed by "#" is cut off.

    @param batch_fd  iterable of lines that has a close() method
    @returns         list of URLs
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit is passed by keyword since passing it
        # positionally is deprecated as of Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2488
2489
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data (see urllib.parse.urlencode) and return it as ASCII bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2492
2493
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  update query
       @returns             str
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            # Nothing to change
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        # Existing parameters are kept unless overridden by query_update
        merged_query = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged_query, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2512
2513
def update_url_query(url, query):
    """Return url with its query parameters updated from the `query` mapping"""
    updated = update_url(url, query_update=query)
    return updated
2516
2517
def _multipart_encode_impl(data, boundary):
    """Encode data as multipart/form-data using the given boundary

    Returns (body bytes, Content-Type header value). Raises ValueError if
    the boundary occurs inside any of the encoded fields.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes

    parts = []
    for name, value in data.items():
        if isinstance(name, str):
            name = name.encode()
        if isinstance(value, str):
            value = value.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        field = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if boundary_bytes in field:
            raise ValueError('Boundary overlaps with data')
        parts.append(delimiter + b'\r\n' + field)

    # Closing delimiter
    parts.append(delimiter + b'--\r\n')

    return b''.join(parts), content_type
2538
2539
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple of the encoded body and the Content-Type header value.
    A ValueError is raised if a specified boundary occurs in the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-supplied boundary is used as is; a clash with the data propagates
        return _multipart_encode_impl(data, boundary)

    # Otherwise keep drawing random boundaries until one does not clash
    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
2568
2569
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Check whether `x` is an instance of `allowed_types` but not of `blocked_types`.

    When `blocked_types` is left at NO_DEFAULT, str, bytes and mappings are
    blocked.
    """
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
2574
2575
def variadic(x, allowed_types=NO_DEFAULT):
    """Return `x` itself if it is iterable-like, else wrap it in a 1-tuple.

    `allowed_types` is forwarded to is_iterable_like() as `blocked_types`,
    i.e. instances of these types are wrapped rather than iterated.
    Passing anything other than a type or a tuple is deprecated; such a value
    is converted with tuple().
    """
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
2581
2582
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each of `funcs` in turn and return the first acceptable result.

    @param funcs          Callables, each invoked as f(*args, **kwargs)
    @param expected_type  If given, a result is only accepted when it is an
                          instance of this type
    @returns              The first accepted result, or None if there is none

    AttributeError, KeyError, TypeError, IndexError, ValueError and
    ZeroDivisionError raised by a callable are swallowed; anything else
    propagates.
    """
    IGNORED_ERRORS = (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError)
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except IGNORED_ERRORS:
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
2592
2593
def try_get(src, getter, expected_type=None):
    """Apply one or more getters to `src` via try_call() and return its result.

    `getter` may be a single callable or an iterable of callables; each is
    called with `src` as its only argument.
    """
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
2596
2597
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of `dct` for which cndn(key, value) is truthy.

    By default, items whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
2600
2601
def merge_dicts(*dicts):
    """Merge dicts, giving precedence to earlier ones.

    A key takes the first non-None value seen for it. The only case in which
    a later value replaces an earlier one is when the stored value equals ''
    and the later value is a str.
    """
    merged = {}
    for source in dicts:
        for key, value in source.items():
            if key not in merged:
                if value is not None:
                    merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
2610
2611
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as a str.

    A str is returned unchanged; any other object is passed to
    str(string, encoding, errors), i.e. it is decoded.
    """
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
2614
2615
# Age limit associated with each US film rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limit associated with each TV parental guideline label
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """Convert an age rating into a numeric age limit.

    @param s  An int (accepted only within 0..21), or a string that is
              either one or two digits optionally followed by "+", a key of
              US_RATINGS, or a TV rating such as "TV-14", "TV_14" or "TV14"
              (rating labels are matched case-insensitively)
    @returns  The age limit as an int, or None if `s` is not recognized
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, str):
        return None

    plain_age = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if plain_age:
        return int(plain_age.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    # Match on the part after "TV-" so that the separator may vary
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
2651
2652
def strip_jsonp(code):
    """Strip a JSONP callback wrapper and return the wrapped payload.

    Handles an optional "window." prefix, an optional "name && name" guard,
    an optional trailing semicolon and trailing "//" comments. Input that
    does not look like a JSONP call is returned unchanged.
    """
    JSONP_RE = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    payload = re.sub(JSONP_RE, r'\g<callback_data>', code)
    return payload
2661
2662
def js_to_json(code, vars={}, *, strict=False):
    """Rewrite JavaScript value/object literal text as JSON text.

    The conversion is done purely with regex substitutions; the result is not
    validated as JSON here.

    @param code    JavaScript source text
    @param vars    Mapping of identifier -> replacement text, substituted for
                   matching bare identifiers. It is only read, never modified
    @param strict  If false (default), a few lenient pre-rewrites are applied
                   (new Date("..."), other "new X(...)" expressions,
                   parseInt(...) and simple immediately-invoked functions) and
                   unknown identifiers are emitted as quoted strings.
                   If true, an unknown identifier raises ValueError
    @returns       The rewritten text
    @raises ValueError  in strict mode, for a token that cannot be translated
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # One alternative per quote style; backslash escapes are allowed inside
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    # /* ... */ block comments, or // comments up to and including the newline
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Whitespace with at most one comment in between
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) pairs for hexadecimal and leading-zero octal integers,
    # each optionally followed by ":" (i.e. used as an object key)
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # group(1) is a bare double quote, group(2) the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # - characters in JSON_PASSTHROUGH_ESCAPES are emitted with a backslash
        # - \x becomes \u00, so that \xNN turns into \u00NN
        # - a backslash-newline pair is dropped entirely
        # - for anything else only the backslash is dropped
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Convert the expression inside ${...} recursively. If it converts to a
        # JSON string, splice in its decoded content instead of the quoted form
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Translate one token matched by the main regex at the bottom
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, runs of "!" and trailing commas are removed
            return ''

        if v[0] in STRING_QUOTES:
            # Only backtick strings get ${...} substitution; the result is
            # always re-quoted with double quotes
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Used as an object key, the number has to be quoted
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                # In non-strict mode, a replacement that is not valid JSON is
                # emitted as a JSON string instead; strict mode skips this check
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            # Unknown identifier: quote it
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # Convert the array literal passed to "new Map(...)" (or "[]" if none)
        # and re-serialize it as a JSON object through dict()
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # new Date("...") -> "..."
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        # Any other "new X(...)" expression is kept as a JSON string of its source text
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        # parseInt(<non-digits>123<non-digits>) -> 123
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        # (function(...){...})("arg") -> "arg"
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    # Tokenize: strings, comments, trailing commas, "void 0", identifiers,
    # hex/octal integers, decimal object keys and runs of "!"
    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2741
2742
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in `quality_ids`,
    or to -1 if the id is not listed.
    """
    def quality_of(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return quality_of
2751
2752
# Names of the stages at which a postprocessor can be run
# NOTE(review): the order presumably mirrors the processing pipeline - confirm against the users of this tuple
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output filename templates (%-style with mapping keys), keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types
# NOTE(review): the values look like a fixed filename suffix for that type,
# with None meaning there is none - confirm against the code consuming this
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# Template for a verbose regex matching a single %-format specifier.
# {0} is a placeholder for the mapping key pattern and {1} for the conversion
# type pattern (presumably filled in with str.format() by the users of this).
# "prefix" captures any escaped "%%" pairs directly in front of the specifier
#
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# Conversion type characters of %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2792
2793
def limit_length(s, length):
    """ Add ellipses to overly long strings

    @param s       The string to shorten; None is passed through
    @param length  Maximum length of the result
    @returns       `s` itself if it fits in `length` characters, otherwise a
                   truncated copy ending in "..." that is exactly `length`
                   characters long. If `length` is too small to hold the
                   ellipsis, `s` is simply cut to `length` characters
                   (the empty string for a non-positive `length`)
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # `length - len(ELLIPSES)` would be negative here; used as a slice
        # bound it counts from the end of the string and produces a result
        # longer than `length`
        return s[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
2802
2803
def version_tuple(v):
    """Split a version string on "." and "-" and return its components as a tuple of ints.

    Raises ValueError if a component is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2806
2807
def is_outdated_version(version, limit, assume_new=True):
    """Check whether `version` is older than `limit`.

    @param version     Version string to check
    @param limit       Version string to compare against
    @param assume_new  What to assume if `version` is empty or one of the
                       versions cannot be parsed: if true, the version is
                       reported as not outdated
    @returns           bool
    """
    unknown_result = not assume_new
    if not version:
        return unknown_result
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return unknown_result
    return current < minimum
2815
2816
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at module level
    from ..update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
2823
2824
def args_to_str(args):
    """Join `args` into a single string, quoting each one with compat_shlex_quote()."""
    # Get a short string representation for a subprocess command
    quoted_args = map(compat_shlex_quote, args)
    return ' '.join(quoted_args)
2828
2829
def error_to_str(err):
    """Format an exception as "<class name>: <message>"."""
    return '{}: {}'.format(type(err).__name__, err)
2832
2833
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type to a file extension.

    @param mt       MIME type, optionally followed by ";"-separated parameters
    @param default  Value to return when `mt` is not a string or not in the
                    table
    @returns        The extension from the table. When there is no match and
                    no `default` was given: None if `mt` is not a string,
                    otherwise the subtype with "+" replaced by "."

    The table is tried with the full type, then the subtype, then the part
    of the subtype after the last "+".
    """
    if not isinstance(mt, str):
        return None if default is NO_DEFAULT else default

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'video/ogg': 'ogv',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop parameters such as "; charset=..." and normalize the case
    mimetype = mt.partition(';')[0].strip().lower()
    subtype = mimetype.rpartition('/')[2]
    structured_suffix = subtype.rsplit('+')[-1]

    ext = traversal.traverse_obj(MAP, mimetype, subtype, structured_suffix)
    if ext:
        return ext
    if default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
2923
2924
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension or a URL/filename.

    A value without any "." is treated as a bare extension.
    Returns None for empty input or if the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mimetype, _ = mimetypes.guess_type(filename)
    return mimetype
2931
2932
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video/audio/subtitle codecs.

    @param codecs_str  Codecs string, e.g. "avc1.64001f, mp4a.40.2"
    @returns           {} for empty input. If at least one known codec was
                       found: a dict with "vcodec" and "acodec" ("none" when
                       absent), "dynamic_range" ("DV", "HDR10" or None) and,
                       if present, "scodec". If no codec is known but there
                       are exactly two entries, they are taken as vcodec and
                       acodec in that order. Otherwise {}

    Only the first codec of each kind is used. Unknown codecs are reported
    through write_string().
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros directly in front of a digit are removed before comparing
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if not (vcodec or acodec or scodec):
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        return {}

    result = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if scodec is not None:
        result['scodec'] = scodec
    return result
2973
2974
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension for the given video/audio streams.

    @param vcodecs      Codecs of the video streams
    @param acodecs      Codecs of the audio streams
    @param vexts        Extensions of the video streams (same length as vcodecs)
    @param aexts        Extensions of the audio streams (same length as acodecs)
    @param preferences  Optional sequence of extensions to choose from, in
                        order of preference
    @returns            The chosen extension

    "mkv" is used whenever it is allowed and there is more than one stream of
    a kind. Otherwise a container is first looked for by codec, then by
    extension; only the first codec of each kind is considered. If nothing
    matches, "mkv" is returned if allowed, else the last preference.
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codec_list):
        # First codec only: part before the first ".", zeros removed,
        # lowercased. None if that cannot be computed
        return try_get(codec_list, lambda x: x[0].split('.')[0].replace('0', '').lower())

    vcodec = sanitize_codec(vcodecs)
    acodec = sanitize_codec(acodecs)

    # Pass 1: pick a container by codec compatibility
    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv':
            return ext
        if codec_set.issuperset((vcodec, acodec)):
            return ext

    # Pass 2: pick a container by extension compatibility
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext}:
            return ext
        if any(ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext

    if allow_mkv:
        return 'mkv'
    return preferences[-1]
3014
3015
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Detect a file extension from the headers of a response.

    Sources, in order of precedence:
      1. the filename of an "attachment" Content-Disposition header
      2. whatever follows the last "." of the x-amz-meta-name header
      3. the Content-Type header, through mimetype2ext()

    @param url_handle  Object whose `headers` attribute has a get() method
    @param default     Passed on to mimetype2ext()
    """
    get_header = url_handle.headers.get

    disposition = get_header('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    amz_name = get_header('x-amz-meta-name')
    if amz_name:
        ext = amz_name.rpartition('.')[2]
        if ext:
            return ext

    content_type = get_header('Content-Type')
    return mimetype2ext(content_type, default=default)
3034
3035
def encode_data_uri(data, mime_type):
    """Build a base64 "data:" URI from the bytes `data` and the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3038
3039
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no limit is set, or when the content is
    # available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3048
3049
# List of known byte-order-marks (BOM)
# The UTF-32 marks come before the UTF-16 ones since the UTF-16-LE mark is a
# prefix of the UTF-32-LE one
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Leading byte-order-marks are stripped and determine the encoding used for
    decoding (UTF-8 if there is none). The result is the match object of an
    optional run of whitespace followed by "<" (truthy), or None.
    """

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        while first_bytes.startswith(bom):
            first_bytes = first_bytes[len(bom):]
            encoding = bom_encoding

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3069
3070
def determine_protocol(info_dict):
    """Determine the protocol of a format.

    An explicit "protocol" entry (anything but None) is returned as is.
    Otherwise the protocol is derived from the sanitized "url": from its
    rtmp/mms/rtsp prefix, from an m3u8/f4m extension, or else its URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3091
3092
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned

    @param header_row  List of column titles
    @param data        List of rows, each a list of values (converted with str())
    @param delim       If truthy, a string that is repeated to form a
                       separator row between the header and the data
    @param extra_gap   Number of spaces between columns in addition to the
                       one that is always added
    @param hide_empty  If true, drop the columns in which every cell of
                       `data` has zero width
    @returns           The table as a single string, one line per row, with
                       trailing whitespace removed from each line
    """
    def width(string):
        # Displayed width: terminal sequences and tabs do not count
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        # Width of the widest cell of each column
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep the cells whose entry in filterArray is truthy; cells beyond
        # the end of filterArray are kept (fillvalue=True)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # With hide_empty, a column width of 0 acts as "drop this column".
    # Otherwise the filter is empty and every column is kept
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    # Pad the cells in place; the rows are the copies made above, not the caller's lists
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Expand the tab into the padding, pushing the text after it to the right edge
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3123
3124
3125 def _match_one(filter_part, dct, incomplete):
3126     # TODO: Generalize code with YoutubeDL._build_format_filter
3127     STRING_OPERATORS = {
3128         '*=': operator.contains,
3129         '^=': lambda attr, value: attr.startswith(value),
3130         '$=': lambda attr, value: attr.endswith(value),
3131         '~=': lambda attr, value: re.search(value, attr),
3132     }
3133     COMPARISON_OPERATORS = {
3134         **STRING_OPERATORS,
3135         '<=': operator.le,  # "<=" must be defined above "<"
3136         '<': operator.lt,
3137         '>=': operator.ge,
3138         '>': operator.gt,
3139         '=': operator.eq,
3140     }
3141
3142     if isinstance(incomplete, bool):
3143         is_incomplete = lambda _: incomplete
3144     else:
3145         is_incomplete = lambda k: k in incomplete
3146
3147     operator_rex = re.compile(r'''(?x)
3148         (?P<key>[a-z_]+)
3149         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3150         (?:
3151             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3152             (?P<strval>.+?)
3153         )
3154         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3155     m = operator_rex.fullmatch(filter_part.strip())
3156     if m:
3157         m = m.groupdict()
3158         unnegated_op = COMPARISON_OPERATORS[m['op']]
3159         if m['negation']:
3160             op = lambda attr, value: not unnegated_op(attr, value)
3161         else:
3162             op = unnegated_op
3163         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3164         if m['quote']:
3165             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3166         actual_value = dct.get(m['key'])
3167         numeric_comparison = None
3168         if isinstance(actual_value, (int, float)):
3169             # If the original field is a string and matching comparisonvalue is
3170             # a number we should respect the origin of the original field
3171             # and process comparison value as a string (see
3172             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3173             try:
3174                 numeric_comparison = int(comparison_value)
3175             except ValueError:
3176                 numeric_comparison = parse_filesize(comparison_value)
3177                 if numeric_comparison is None:
3178                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3179                 if numeric_comparison is None:
3180                     numeric_comparison = parse_duration(comparison_value)
3181         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3182             raise ValueError('Operator %s only supports string values!' % m['op'])
3183         if actual_value is None:
3184             return is_incomplete(m['key']) or m['none_inclusive']
3185         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3186
3187     UNARY_OPERATORS = {
3188         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3189         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3190     }
3191     operator_rex = re.compile(r'''(?x)
3192         (?P<op>%s)\s*(?P<key>[a-z_]+)
3193         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3194     m = operator_rex.fullmatch(filter_part.strip())
3195     if m:
3196         op = UNARY_OPERATORS[m.group('op')]
3197         actual_value = dct.get(m.group('key'))
3198         if is_incomplete(m.group('key')) and actual_value is None:
3199             return True
3200         return op(actual_value)
3201
3202     raise ValueError('Invalid filter part %r' % filter_part)
3203
3204
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.

    The filter consists of conditions separated by "&", all of which must
    pass. A literal "&" inside a condition is written with a preceding
    backslash.

    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        condition = filter_part.replace(r'\&', '&')
        if not _match_one(condition, dct, incomplete):
            return False
    return True
3215
3216
def match_filter_func(filters, breaking_filters=None):
    """Build a match-filter function out of filter strings.

    @param filters           A filter string or an iterable of them; an entry
                             passes if it matches any one of them. The special
                             filter "-" marks the result as interactive
    @param breaking_filters  Filters handled the same way, except that the
                             returned function raises RejectedVideoReached
                             whenever the function built from them returns
                             anything but None
    @returns                 None if no filters are given at all. Otherwise a
                             function (info_dict, incomplete=False) returning
                             None if the entry passes (NO_DEFAULT instead when
                             interactive and not incomplete), or a message
                             explaining that the entry is skipped
    """
    if not (filters or breaking_filters):
        return None
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        rejection = breaking_filters(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if filters and not any(match_str(f, info_dict, incomplete) for f in filters):
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'

        if interactive and not incomplete:
            return NO_DEFAULT
        return None
    return _match_func
3239
3240
class download_range_func:
    """Callable yielding the sections of a video that should be downloaded.

    @param chapters   Regexes; every chapter whose title matches one of them
                      is a section
    @param ranges     (start, end) pairs; a negative timestamp counts from
                      the end of the video if its duration is known
    @param from_info  If true, also use the "start_time"/"end_time" of the
                      info dict as a section when either of them is set
    """

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):
        """Generate the section dicts for `info_dict`.

        Matching chapters are yielded with an added "index" key, ranges as
        dicts with "start_time" and "end_time". A single empty dict is
        yielded if neither chapters nor ranges are configured and the
        `from_info` section does not apply.
        `ydl` is only used to report that no chapter could be matched.
        """

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # A negative timestamp is relative to the end of the video (clamped
        # at 0). It is left untouched when the duration is not known
        return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time

    def __eq__(self, other):
        # from_info changes what __call__ yields, so it has to take part in
        # the comparison. It is only ever tested for truthiness
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges
                and bool(self.from_info) == bool(other.from_info))

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges}, {self.from_info})'
3281
3282
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds.

    Accepts either a plain number with an optional "s" suffix, or a clock
    time of the form H:MM:SS with an optional fraction introduced by "." or ":".
    Returns a number of seconds, or None if the input is empty or not
    recognized.
    """
    if not time_expr:
        return None

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match is not None:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match is None:
        return None
    hours, minutes, seconds = clock_match.groups()
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3294
3295
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    time = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % time
3298
3299
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode (H:MM:SS.cc).

    The last field is the millisecond part divided by 10.
    """
    timetuple = timetuple_from_msec(seconds * 1000)
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
3303
3304
3305 def dfxp2srt(dfxp_data):
3306     '''
3307     @param dfxp_data A bytes-like object containing DFXP data
3308     @returns A unicode object containing converted SRT data
3309     '''
3310     LEGACY_NAMESPACES = (
3311         (b'http://www.w3.org/ns/ttml', [
3312             b'http://www.w3.org/2004/11/ttaf1',
3313             b'http://www.w3.org/2006/04/ttaf1',
3314             b'http://www.w3.org/2006/10/ttaf1',
3315         ]),
3316         (b'http://www.w3.org/ns/ttml#styling', [
3317             b'http://www.w3.org/ns/ttml#style',
3318         ]),
3319     )
3320
3321     SUPPORTED_STYLING = [
3322         'color',
3323         'fontFamily',
3324         'fontSize',
3325         'fontStyle',
3326         'fontWeight',
3327         'textDecoration'
3328     ]
3329
3330     _x = functools.partial(xpath_with_ns, ns_map={
3331         'xml': 'http://www.w3.org/XML/1998/namespace',
3332         'ttml': 'http://www.w3.org/ns/ttml',
3333         'tts': 'http://www.w3.org/ns/ttml#styling',
3334     })
3335
3336     styles = {}
3337     default_style = {}
3338
3339     class TTMLPElementParser:
3340         _out = ''
3341         _unclosed_elements = []
3342         _applied_styles = []
3343
3344         def start(self, tag, attrib):
3345             if tag in (_x('ttml:br'), 'br'):
3346                 self._out += '\n'
3347             else:
3348                 unclosed_elements = []
3349                 style = {}
3350                 element_style_id = attrib.get('style')
3351                 if default_style:
3352                     style.update(default_style)
3353                 if element_style_id:
3354                     style.update(styles.get(element_style_id, {}))
3355                 for prop in SUPPORTED_STYLING:
3356                     prop_val = attrib.get(_x('tts:' + prop))
3357                     if prop_val:
3358                         style[prop] = prop_val
3359                 if style:
3360                     font = ''
3361                     for k, v in sorted(style.items()):
3362                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3363                             continue
3364                         if k == 'color':
3365                             font += ' color="%s"' % v
3366                         elif k == 'fontSize':
3367                             font += ' size="%s"' % v
3368                         elif k == 'fontFamily':
3369                             font += ' face="%s"' % v
3370                         elif k == 'fontWeight' and v == 'bold':
3371                             self._out += '<b>'
3372                             unclosed_elements.append('b')
3373                         elif k == 'fontStyle' and v == 'italic':
3374                             self._out += '<i>'
3375                             unclosed_elements.append('i')
3376                         elif k == 'textDecoration' and v == 'underline':
3377                             self._out += '<u>'
3378                             unclosed_elements.append('u')
3379                     if font:
3380                         self._out += '<font' + font + '>'
3381                         unclosed_elements.append('font')
3382                     applied_style = {}
3383                     if self._applied_styles:
3384                         applied_style.update(self._applied_styles[-1])
3385                     applied_style.update(style)
3386                     self._applied_styles.append(applied_style)
3387                 self._unclosed_elements.append(unclosed_elements)
3388
3389         def end(self, tag):
3390             if tag not in (_x('ttml:br'), 'br'):
3391                 unclosed_elements = self._unclosed_elements.pop()
3392                 for element in reversed(unclosed_elements):
3393                     self._out += '</%s>' % element
3394                 if unclosed_elements and self._applied_styles:
3395                     self._applied_styles.pop()
3396
3397         def data(self, data):
3398             self._out += data
3399
3400         def close(self):
3401             return self._out.strip()
3402
3403     # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
3404     # This will not trigger false positives since only UTF-8 text is being replaced
3405     dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
3406
3407     def parse_node(node):
3408         target = TTMLPElementParser()
3409         parser = xml.etree.ElementTree.XMLParser(target=target)
3410         parser.feed(xml.etree.ElementTree.tostring(node))
3411         return parser.close()
3412
3413     for k, v in LEGACY_NAMESPACES:
3414         for ns in v:
3415             dfxp_data = dfxp_data.replace(ns, k)
3416
3417     dfxp = compat_etree_fromstring(dfxp_data)
3418     out = []
3419     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3420
3421     if not paras:
3422         raise ValueError('Invalid dfxp/TTML subtitle')
3423
3424     repeat = False
3425     while True:
3426         for style in dfxp.findall(_x('.//ttml:style')):
3427             style_id = style.get('id') or style.get(_x('xml:id'))
3428             if not style_id:
3429                 continue
3430             parent_style_id = style.get('style')
3431             if parent_style_id:
3432                 if parent_style_id not in styles:
3433                     repeat = True
3434                     continue
3435                 styles[style_id] = styles[parent_style_id].copy()
3436             for prop in SUPPORTED_STYLING:
3437                 prop_val = style.get(_x('tts:' + prop))
3438                 if prop_val:
3439                     styles.setdefault(style_id, {})[prop] = prop_val
3440         if repeat:
3441             repeat = False
3442         else:
3443             break
3444
3445     for p in ('body', 'div'):
3446         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3447         if ele is None:
3448             continue
3449         style = styles.get(ele.get('style'))
3450         if not style:
3451             continue
3452         default_style.update(style)
3453
3454     for para, index in zip(paras, itertools.count(1)):
3455         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3456         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3457         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3458         if begin_time is None:
3459             continue
3460         if not end_time:
3461             if not dur:
3462                 continue
3463             end_time = begin_time + dur
3464         out.append('%d\n%s --> %s\n%s\n\n' % (
3465             index,
3466             srt_subtitles_timecode(begin_time),
3467             srt_subtitles_timecode(end_time),
3468             parse_node(para)))
3469
3470     return ''.join(out)
3471
3472
def cli_option(params, command_option, param, separator=None):
    """Build the command-line fragment for a single valued option.

    @param params          mapping the option value is looked up in
    @param command_option  option name to emit (e.g. '--proxy')
    @param param           key of the value inside `params`
    @param separator       if given, option and value are joined with it into
                           one argument; otherwise they are two arguments
    @returns               list of arguments; empty if the value is None/missing
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3478
3479
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command-line fragment for a boolean option.

    The value stored under `param` must be True, False or None (missing).
    True/False are rendered as `true_value`/`false_value` via cli_option();
    None yields an empty list.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered_values = {True: true_value, False: false_value}
    return cli_option(rendered_values, command_option, flag, separator)
3484
3485
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else an empty list."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3488
3489
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick the extra command-line arguments configured for one of `keys`.

    @param argdict     dict mapping lower-case keys to argument lists.
                       A plain list/tuple (legacy format) is returned as-is
                       when `use_compat` is true and ignored otherwise
    @param keys        list/tuple of candidates in order of preference; each
                       candidate is a single key or a group of keys whose
                       argument lists are concatenated
    @param default     returned (the same object, not a copy) if nothing matches
    @param use_compat  whether the legacy list/tuple form of `argdict` is honoured
    @returns           flat list of arguments for the first candidate with a match
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_group in keys:
        found = []
        for name in variadic(key_group):
            configured = argdict.get(name.lower())
            if configured is not None:
                found.append(configured)
        if not found:
            continue
        # Concatenate the argument lists of every key in the group
        flattened = []
        for args in found:
            flattened.extend(args)
        return flattened
    return default
3508
3509
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Look up the configured arguments for `exe` when used by `main_key`.

    The root key is `exe` itself when it equals `main_key` (case-insensitively),
    otherwise '<main_key>+<exe>'. Each entry of `keys` is a suffix appended to
    the root key. When the bare root key is among the candidates, the
    (main_key, exe) group and 'default' are added as fallbacks; otherwise the
    legacy list form of `argdict` is not honoured.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    if main_key == exe:
        root_key = exe
    else:
        root_key = f'{main_key}+{exe}'

    candidates = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in candidates:
        use_compat = False
    else:
        if main_key != exe:
            candidates.append((main_key, exe))
        candidates.append('default')
    return cli_configuration_args(argdict, candidates, default, use_compat)
3521
3522
class ISO639Utils:
    """Conversion between two-letter (ISO 639-1) and three-letter (ISO 639-2/T) language codes"""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of `code` are looked at, so anything
        trailing them (e.g. the region in 'en-US') is ignored.
        Returns None for unknown codes.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        When several two-letter codes share a three-letter code, the one
        listed first in the table wins. Returns None for unknown codes.
        """
        matches = (short for short, long_ in cls._lang_map.items() if long_ == code)
        return next(matches, None)
3727
3728
class ISO3166Utils:
    """Lookup of country names by two-letter country code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code to the corresponding full name

        The lookup is case-insensitive; None is returned for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
3990
3991
class GeoUtils:
    """Generation of random IPv4 addresses inside per-country or explicit address blocks"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (dotted string) from an address block

        @param code_or_block  either a two-character country code (looked up
                              case-insensitively in the table above) or an
                              explicit block in 'address/prefix-length' form
        @returns              the address, or None for an unknown country code
        """
        if len(code_or_block) != 2:
            cidr = code_or_block
        else:
            cidr = cls._country_ip_map.get(code_or_block.upper())
            if not cidr:
                return None

        base_addr, prefix_len = cidr.split('/')
        lowest, = struct.unpack('!L', socket.inet_aton(base_addr))
        # Setting all host bits gives the highest address of the block
        highest = lowest | (0xffffffff >> int(prefix_len))
        chosen = random.randint(lowest, highest)
        return str(socket.inet_ntoa(struct.pack('!L', chosen)))
4250
4251
4252 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4253 # released into Public Domain
4254 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4255
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    The result has no leading zero bytes, except that zero (and, for
    backward compatibility, any negative value) yields a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # Minimal number of bytes needed to represent n
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4284
4285
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    The bytes are interpreted as an unsigned big-endian number; an empty
    string gives 0.

    This is (essentially) the inverse of long_to_bytes().
    """
    return int.from_bytes(s, 'big')
4301
4302
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    The input bytes are reversed before being read as a hexadecimal number,
    i.e. the data is treated as a little-endian integer.

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return f'{ciphertext:x}'
4318
4319
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result is laid out as 00 02 <non-zero padding> 00 <data>.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves less than 11 bytes for the
                               header, padding and separator
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding must consist of non-zero octets only: the first zero after
    # the 00 02 header is the separator that marks where the data begins
    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + padding + [0] + list(data)
4333
4334
4335 def _base_n_table(n, table):
4336     if not table and not n:
4337         raise ValueError('Either table or n must be specified')
4338     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4339
4340     if n and n != len(table):
4341         raise ValueError(f'base {n} exceeds table length {len(table)}')
4342     return table
4343
4344
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string

    @param num      non-negative integer to encode
    @param n        base; selects the first n symbols of the table
    @param table    digit symbols to use instead of the default table
    @returns        the encoded string, most significant digit first
    @raises ValueError  if num is negative, or if a non-zero num is to be
                        encoded with a table of a single symbol
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    # Floor division never brings a negative number to zero (-1 // base == -1),
    # so the digit loop below would not terminate
    if num < 0:
        raise ValueError(f'cannot encode negative number {num}')

    base = len(table)
    # With one symbol, num // 1 == num and the loop would not terminate either
    if base == 1:
        raise ValueError('cannot encode a non-zero number with a single-symbol table')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4356
4357
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int

    The digit table is resolved the same way as in encode_base_n(). A
    character that is not in the table raises KeyError.
    """
    alphabet = _base_n_table(n, table)
    digit_values = dict(zip(alphabet, range(len(alphabet))))
    radix = len(digit_values)

    number = 0
    for symbol in string:
        number *= radix
        number += digit_values[symbol]
    return number
4365
4366
def decode_packed_codes(code):
    """Expand code matched by PACKED_CODES_RE back into its unpacked form.

    The match supplies the obfuscated code, a base, a symbol count and a
    '|'-separated symbol list. Each word in the obfuscated code is the base-n
    encoding of an index into that list; an empty symbol stands for the
    encoded index itself.
    """
    packed, radix, remaining, words = re.search(PACKED_CODES_RE, code).groups()
    radix = int(radix)
    remaining = int(remaining)
    words = words.split('|')

    replacements = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        replacements[token] = words[remaining] or token

    def substitute(match):
        return replacements[match.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, packed)
4383
4384
def caesar(s, alphabet, shift):
    """Shift every character of `s` that occurs in `alphabet` by `shift`
    positions, wrapping around; other characters are kept as they are."""
    if shift == 0:
        return s

    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4392
4393
def rot47(s):
    """Apply caesar() with a shift of 47 over the alphabet from '!' to '~'."""
    alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, alphabet, 47)
4396
4397
def parse_m3u8_attributes(attrib):
    """Parse a `KEY=value,KEY="quoted value"` attribute list into a dict.

    Surrounding double quotes are removed from quoted values. When a key
    occurs more than once, the last value wins.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        name: (raw[1:-1] if raw.startswith('"') else raw)
        for name, raw in pairs
    }
4405
4406
def urshift(val, n):
    """Unsigned right shift: a negative `val` is first offset by 2**32."""
    if val < 0:
        val += 0x100000000
    return val >> n
4409
4410
def write_xattr(path, key, value):
    """Store `value` (bytes) under the extended attribute `key` of `path`.

    Tries, in order: an NTFS Alternate Data Stream (Windows only), the python
    xattr/pyxattr module, and finally the `setfattr` or `xattr` executable.

    Raises XAttrMetadataError if writing fails and XAttrUnavailableError if
    no way of writing the attribute is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as stream:
                stream.write(value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        supports_unicode = version_tuple(xattr.__version__) >= (0, 5, 0)
        module_setter = xattr.set if supports_unicode else None
    elif xattr:
        module_setter = xattr.setxattr
    else:
        module_setter = None

    if module_setter:
        try:
            module_setter(path, key, value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    if check_executable('setfattr', ['--version']):
        tool = 'setfattr'
    elif check_executable('xattr', ['-h']):
        tool = 'xattr'
    else:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    value = value.decode()
    if tool == 'xattr':
        command = [tool, '-w', key, value, path]
    else:
        command = [tool, '-n', key, '-v', value, path]

    try:
        _, stderr, returncode = Popen.run(
            command, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as err:
        raise XAttrMetadataError(err.errno, err.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4460
4461
def random_birthday(year_field, month_field, day_field):
    """Pick a random date from 1950-01-01 to 1995-12-31 (both inclusive).

    Returns a dict that maps the three given field names to the year, month
    and day of that date, each as a string.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    picked = earliest + datetime.timedelta(random.randint(0, span_days))

    field_names = (year_field, month_field, day_field)
    components = (picked.year, picked.month, picked.day)
    return {name: str(component) for name, component in zip(field_names, components)}
4472
4473
def find_available_port(interface=''):
    """Bind a throwaway socket to port 0 on `interface` and return the port
    number it was given, or None if any step fails with OSError."""
    try:
        with socket.socket() as probe:
            probe.bind((interface, 0))
            bound_address = probe.getsockname()
        return bound_address[1]
    except OSError:
        return None
4481
4482
# Templates for internet shortcut files, which are plain text files.
# Each template is a %-style format string that takes a mapping; all of them
# use the key `url`, and the desktop entry additionally uses `filename`.

# INI-style "[InternetShortcut]" file
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# Apple property list (plist) holding a single URL entry
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# "[Desktop Entry]" file of type Link
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Lookup from link type name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4514
4515
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    The hostname is converted to Punycode. An explicit port is kept, except for port 80 on the `http` scheme, where it is the default and therefore redundant. IRIs without a hostname (e.g. `file:///path`) are supported.

    @param iri      the IRI, as a string
    @returns        the equivalent ASCII-only URI
    @raises ValueError  if the network location contains an IPv6 literal
    """

    iri_parts = urllib.parse.urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no host part; there is nothing to encode then
    hostname = iri_parts.hostname
    if hostname:
        # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
        net_location += hostname.encode('idna').decode()

    # Port 80 is only the default for http; dropping it for any other scheme
    # would make the URI point to a different endpoint
    port = iri_parts.port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4558
4559
def to_high_limit_path(path):
    """Return `path` in a form that is not subject to MAX_PATH on Windows.

    On win32 and cygwin the absolute path is prefixed with `\\\\?\\`; on all
    other platforms the path is returned unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    absolute = os.path.abspath(path)
    return '\\\\?\\' + absolute
4566
4567
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Look up `field` in `obj` and format it with `template`.

    `default` is returned instead when the value is to be ignored: by default
    that is any falsy value; if `ignore` is given, it is any value contained
    in `ignore`. Otherwise the result is `template % func(value)`.
    """
    val = traversal.traverse_obj(obj, *variadic(field))

    if ignore is NO_DEFAULT:
        skip = not val
    else:
        skip = val in variadic(ignore)

    if skip:
        return default
    return template % func(val)
4573
4574
def clean_podcast_url(url):
    """Remove the known tracking/analytics redirect prefixes from a podcast URL.

    If removing a prefix leaves two URL schemes in a row at the start of the
    URL, only the second one is kept.
    """
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/'''
    stripped = re.sub(tracking_prefix_re, '', url)
    return re.sub(r'^\w+://(\w+://)', r'\1', stripped)
4595
4596
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version 4 UUID as a lowercase, hyphenated string.

    In the template, every `x` becomes a random hex digit. The `y` position
    holds the variant bits and is therefore restricted to `8`, `9`, `a` or
    `b`; the version digit `4` is fixed.
    """
    def random_digit(mobj):
        if mobj.group(0) == 'y':
            # The two most significant bits of the variant digit must be 0b10
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4602
4603
def make_dir(path, to_screen=None):
    """Create the parent directory of `path`, including missing ancestors.

    @param path         path of a file whose directory should exist
    @param to_screen    optional callable that is given an error message if
                        the directory cannot be created
    @returns            True on success (or when `path` has no directory
                        part), False if creating the directory failed
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Only report when a reporter was actually given; the default is None
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4614
4615
def get_executable_path():
    """Return the absolute directory of the path that
    `_get_variant_and_executable_path()` reports as its second element."""
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    absolute = os.path.abspath(executable)
    return os.path.dirname(absolute)
4620
4621
def get_user_config_dirs(package_name):
    """Yield the per-user configuration directories for `package_name`.

    In order: the directory below $XDG_CONFIG_HOME (or ~/.config when that is
    unset or empty), the directory below %APPDATA% if that variable is set,
    and finally the dot-directory in the user's home.
    """
    # .config (e.g. ~/.config/package_name)
    config_home = os.getenv('XDG_CONFIG_HOME')
    if not config_home:
        config_home = compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # home (~/.package_name)
    home = compat_expanduser('~')
    yield os.path.join(home, f'.{package_name}')
4634
4635
def get_system_config_dirs(package_name):
    """Yield the system-wide configuration directories for `package_name`."""
    # /etc/package_name
    system_dir = os.path.join('/etc', package_name)
    yield system_dir
4639
4640
def time_seconds(**kwargs):
    """
    Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the offset that the keyword arguments describe.

    The keyword arguments are passed to datetime.timedelta as they are
    (e.g. hours=9).
    """
    now = time.time()
    offset = datetime.timedelta(**kwargs)
    return now + offset.total_seconds()
4646
4647
# Creates a JSON Web Signature (JWS) with the HS256 algorithm,
# serialized in the JWS Compact Serialization layout
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Sign `payload_data` with HMAC-SHA256 and return the token as bytes.

    The token is `header.payload.signature`. Each segment is encoded with
    base64.b64encode, i.e. with the standard Base64 alphabet and padding.

    @param payload_data     JSON-serializable payload
    @param key              signing key, as a string
    @param headers          extra header fields; they are merged over the
                            default `alg` and `typ` fields
    """
    def encode_segment(obj):
        return base64.b64encode(json.dumps(obj).encode())

    header_data = {'alg': 'HS256', 'typ': 'JWT'}
    if headers:
        header_data.update(headers)

    signing_input = b'.'.join((encode_segment(header_data), encode_segment(payload_data)))
    mac = hmac.new(key.encode(), signing_input, hashlib.sha256)
    return b'.'.join((signing_input, base64.b64encode(mac.digest())))
4665
4666
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of a three-segment JWT.

    The header and the signature are neither parsed nor verified.
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded_payload = f'{payload_b64}==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
4673
4674
# False on Windows, where it is set to True by windows_enable_vt_mode();
# None on every other OS
WINDOWS_VT_MODE = None if compat_os_name != 'nt' else False
4676
4677
@functools.cache
def supports_terminal_sequences(stream):
    """Tell whether terminal escape sequences may be written to `stream`.

    On Windows this requires WINDOWS_VT_MODE to be truthy; elsewhere the TERM
    environment variable has to be set. In both cases the stream must also
    report that it is a tty. The result is cached per stream.
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False

    try:
        is_tty = stream.isatty()
    except BaseException:
        return False
    return is_tty
4689
4690
def windows_enable_vt_mode():
    """Turn on virtual terminal processing for the Windows console output.

    Does nothing when get_windows_version() reports a version below
    10.0.10586. On success, the module-level WINDOWS_VT_MODE is set to True
    and the cache of supports_terminal_sequences() is cleared.

    Raises a plain Exception if GetConsoleMode or SetConsoleMode reports
    failure.

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported here and not at module level: msvcrt is only available on Windows
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device by name
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        # Convert the C runtime file descriptor into an OS handle for the API calls
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Keep the mode bits that were read and add the VT processing flag
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        # The descriptor is closed whether or not the mode could be changed
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Results cached while the mode was still off are stale now
    supports_terminal_sequences.cache_clear()
4721
4722
# An ESC "[" introducer, then one or more characters other than "m", then "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with every match of _terminal_sequences_re removed."""
    return re.sub(_terminal_sequences_re, '', string)
4728
4729
def number_of_digits(number):
    """Return the length of `number` formatted with '%d' (a minus sign counts)."""
    rendered = '%d' % number
    return len(rendered)
4732
4733
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy `values` with `delim`.

    If `from_dict` is given, each value is first used as a path to look up in
    that object, and the results of the lookups are joined instead.
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
4738
4739
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest (width, height) pair among the formats and, for each thumbnail:
    * Rewrite the URL: whatever `url_width_re` matches is replaced with that width
    * Add those dimensions, unless the thumbnail supplies its own

    The thumbnails are returned as they are when no format has a width.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    candidates = [tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats]
    largest = max(candidates, default=(0, 0))
    max_width = largest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': scaled_url}, dict(zip(dimension_keys, largest)), thumbnail))
    return scaled
4760
4761
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple.

    Returns (start, end, total); end and total may be None. All three are
    None if the value is empty or contains no byte range.
    """
    nothing = (None, None, None)
    if not range:
        return nothing

    found = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if found is None:
        return nothing

    first, last, total = found.groups()
    return int(first), int_or_none(last), int_or_none(total)
4770
4771
def read_stdin(what):
    """Announce that `what` is going to be read from STDIN, naming the key
    combination for EOF, and return the `sys.stdin` stream."""
    eof = 'Ctrl+D'
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
4776
4777
def determine_file_encoding(data):
    """
    Detect the text encoding used

    A BOM listed in BOMS takes precedence; otherwise a `# coding: <name>`
    declaration at the very start of the data is looked for.

    @returns (encoding, bytes to skip)
    """

    # BOM marks are given priority over declarations
    bom_match = next(
        ((encoding, len(bom)) for bom, encoding in BOMS if data.startswith(bom)), None)
    if bom_match is not None:
        return bom_match

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    without_nulls = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', without_nulls)
    if declaration:
        return declaration.group(1).decode(), 0
    return None, 0
4794
4795
class Config:
    """The arguments of one configuration source and the configs it refers to.

    `own_args` holds the raw arguments of this source. `configs` holds one
    child Config per location that the parser's `config_locations` option
    names in those arguments. `_loaded_paths` is shared with all children so
    that no location is loaded twice.
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser = parser
        self.label = label
        self._loaded_paths = set()
        self.configs = []

    def init(self, args=None, filename=None):
        """Set the arguments and the filename of this config and load the
        configs it refers to. Asserts that this has not been done before."""
        assert not self.__initialized
        self.own_args = args
        self.filename = filename
        return self.load_configs()

    def load_configs(self):
        """Parse the own arguments and append a child for each config location.

        Returns False, without parsing anything, if the own file has been
        loaded already; True otherwise.
        """
        base_dir = ''
        if self.filename:
            real_path = os.path.realpath(self.filename)
            if real_path in self._loaded_paths:
                return False
            self._loaded_paths.add(real_path)
            base_dir = os.path.dirname(real_path)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args

        for entry in opts.config_locations or []:
            if entry == '-':
                # STDIN is read once at most
                if entry not in self._loaded_paths:
                    self._loaded_paths.add(entry)
                    self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue

            # Relative locations are resolved against the directory of this config
            target = os.path.join(base_dir, expand_path(entry))
            if os.path.isdir(target):
                target = os.path.join(target, 'yt-dlp.conf')
            if not os.path.exists(target):
                self.parser.error(f'config location {target} does not exist')
            self.append_config(self.read_file(target), target)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')

        header = None
        if self.own_args is not None:
            header = f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}'

        # Every line of a child is prefixed with "| "
        children = [f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs]
        return join_nonempty(header, *children, delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read `filename` and split its contents into a list of arguments.

        Returns `default` if the file cannot be opened and raises ValueError
        if its contents cannot be decoded or split.
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present

        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors

        with optionf:
            try:
                # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                text = optionf.read().decode(enc or preferredencoding())
                return shlex.split(text, comments=True)
            except Exception as err:
                raise ValueError(f'Unable to parse "{filename}": {err}')

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` in which the values of the login options
        are replaced with 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        alternatives = '|'.join(map(re.escape, PRIVATE_OPTS))
        eqre = re.compile(f'^(?P<key>{alternatives})=.+$')

        # Options given as "--option=value"
        scrubbed = []
        for opt in opts:
            found = eqre.match(opt)
            scrubbed.append(found.group('key') + '=PRIVATE' if found else opt)

        # Options given as "--option value"
        for idx in range(len(scrubbed) - 1):
            if scrubbed[idx] in PRIVATE_OPTS:
                scrubbed[idx + 1] = 'PRIVATE'
        return scrubbed

    def append_config(self, *args, label=None):
        """Create a child config from `args` and keep it if it gets loaded."""
        child = type(self)(self.parser, label)
        child._loaded_paths = self._loaded_paths
        if child.init(*args):
            self.configs.append(child)

    @property
    def all_args(self):
        """The arguments of all children, last child first, followed by the
        own arguments."""
        for child in reversed(self.configs):
            yield from child.all_args
        if self.parsed_args:
            yield from self.parsed_args

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
4903
4904
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes

    Every instance owns a private event loop, on which all coroutines of the
    connection are run to completion. The instance works as a context
    manager; its __exit__ is also registered with atexit.
    """
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        # The atexit hook calls this again after an explicit exit; there is
        # nothing left to do, and a closed loop cannot run anything
        if self.loop.is_closed():
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Cancelling has to run on the loop, so it must come before close()
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` on `loop` and return its result."""
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel all tasks of `loop` and run the loop until they are done."""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # gather() takes the loop from the tasks themselves; its "loop"
        # parameter is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
4974
4975
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones

    The names in the result are title-cased.
    """
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
4979
4980
def cached_method(f):
    """Cache the results of a method per instance.

    The cache is kept in the instance's `_cached_method__cache` attribute,
    under the name of the method. Calls are told apart by their arguments
    after binding to the signature and applying defaults, so the same
    argument passed by position or by keyword shares one entry.
    """
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        # The first bound argument is the instance itself
        key = tuple(bound.arguments.values())[1:]

        per_instance = vars(self).setdefault('_cached_method__cache', {})
        cache = per_instance.setdefault(f.__name__, {})
        if key in cache:
            return cache[key]
        cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper
4996
4997
class classproperty:
    """property access for class methods with optional caching

    Can be used bare (`@classproperty`) or with arguments
    (`@classproperty(cache=True)`). With caching, the function is evaluated
    once for every class through which the property is accessed.
    """
    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # No function yet: hand back a decorator that remembers the arguments
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls not in cache:
            cache[cls] = self.func(cls)
        return cache[cls]
5016
5017
class function_with_repr:
    """Callable wrapper around `func` that has a custom repr().

    The repr is `repr_` if one is given (and non-empty), otherwise the
    qualified name of the function, prefixed with its module.
    """
    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5030
5031
class Namespace(types.SimpleNamespace):
    """SimpleNamespace that can be iterated over

    Iterating yields the attribute values; `items_` gives the (name, value)
    pairs.
    """

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        return vars(self).items()
5041
5042
# File extensions grouped by media kind. The `video` and `audio` groups start
# with the less common extensions only; the `common_*` groups are appended to
# them right after the namespace is created.
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# From here on, `video` and `audio` contain the common extensions as well
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in that order
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5057
5058
class RetryManager:
    """Iterator that yields itself once per attempt, for as long as the
    previous attempt recorded an error and retries are left.

    Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue

    After an attempt that recorded an error, the error callback is called with
    the error, the attempt number and the number of retries.
    """
    attempt = 0
    _error = None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # NO_DEFAULT marks an attempt during which no error was recorded
        if self._error is NO_DEFAULT:
            return False
        return self.attempt <= self.retries

    @property
    def error(self):
        return None if self._error is NO_DEFAULT else self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        Once `count` exceeds `retries`, the error is passed to `error`, or
        raised if no `error` function is given. Before that, a warning is
        issued and, if `sleep_func` gives a delay, the delay is slept.
        """
        if count > retries:
            if not error:
                raise e
            if count > 1:
                return error(f'{e}. Giving up after {count - 1} retries')
            return error(str(e))

        if not count:
            return warn(e)
        if isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        if callable(sleep_func):
            delay = float_or_none(sleep_func(n=count - 1))
        else:
            delay = sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5113
5114
def make_archive_id(ie, video_id):
    """Build the archive id '<lowercase extractor key> <video id>'.

    `ie` is either the extractor key itself, as a string, or an object that
    provides it through `ie_key()`.
    """
    if isinstance(ie, str):
        key = ie
    else:
        key = ie.ie_key()
    return f'{key.lower()} {video_id}'
5118
5119
def truncate_string(s, left, right=0):
    """Shorten `s` if it is longer than `left + right` characters.

    The result consists of the first `left - 3` characters, an ellipsis
    ('...') and the last `right` characters. None and strings that are short
    enough are returned as they are.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s

    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5125
5126
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Resolve a list of options into an ordered set of items.

    An option names either an alias from `alias_dict`, which is expanded, or
    an item of `alias_dict['all']`. A leading '-' removes what the option
    stands for from the items requested so far, instead of adding it. With
    `use_regex`, an option that is not an alias is matched as a
    case-insensitive regex against the items of `alias_dict['all']`.

    Raises ValueError for an option that is neither an alias nor a known item.
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])

    for option in options:
        discard = option.startswith('-')
        name = option[1:] if discard else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if discard:
                # Discarding an alias flips the sign of each of its entries
                expansion = [
                    entry[1:] if entry.startswith('-') else f'-{entry}' for entry in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            current = [name]
        else:
            raise ValueError(name)

        if not discard:
            requested.extend(current)
            continue
        for item in current:
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5155
5156
5157 # TODO: Rewrite
class FormatSorter:
    """Compute sort keys for format dicts from a list of sort-field specifications.

    A specification is parsed with `regex`: an optional leading "+" (reverse),
    a field name, and optionally ":" or "~" followed by a limit, where "~"
    requests the value closest to the limit.

    `settings` describes each known field. Keys used by the code below:
        type         'field' (default), 'extractor', 'boolean', 'ordered',
                     'multiple', 'combined' or 'alias'
        field        Key(s) of the format dict to read, or the target of an alias
        convert      How a limit text is converted ('string', 'float_none',
                     'bytes', 'order', 'ignore'; anything else is numeric-if-possible)
        order        Preference list for 'ordered' fields (best first);
                     `order_free` replaces it when prefer_free_formats is set
        regex        Whether the entries of `order` are regexes
        forced       Field is always sorted first
        priority     Field is sorted first unless format_sort_force is set
        visible      Whether the field is shown by print_verbose_info()
        default      Value used when the format's value cannot be made numeric
        max, in_list, not_in_list, function, deprecated
                     See _calculate_field_preference_from_value(),
                     _calculate_field_preference() and evaluate_params()

    The class-level `settings` is only a template: each instance created through
    __init__() works on its own copy, so limits, cached defaults and conversion
    changes made for one sorter do not leak into other sorters.
    """
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        """
        @param ydl               Object providing `params` (a dict), write_debug()
                                 and deprecated_feature()
        @param field_preference  Sort specifications given by the extractor
        """
        self.ydl = ydl
        self._order = []
        # The methods below write into the per-field dicts (resolved limits, cached
        # defaults, conversion changes, arbitrary new fields). Use a private copy so
        # that this state cannot leak into other sorters through the class attribute.
        # Copying one level deep is enough: the nested lists/tuples are only read
        self.settings = {name: dict(spec) for name, spec in type(self).settings.items()}
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return setting `key` of `field`, computing and caching a default if it is unset.

        Unknown fields are registered with empty settings (after a deprecation
        notice), except when only 'forced'/'priority' is queried, which simply
        yields False without registering anything.
        """
        if field not in self.settings:
            # evaluate_params() queries these two keys with raw specifications
            # such as 'hdr:12', which must not be registered as fields
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            # Cache the default so later lookups are plain dict accesses
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert `value` (a limit text or a raw format value) as per the field's 'convert' setting.

        @param convertNone  If set, a None value is still run through the conversion
                            instead of being returned as None right away
        @returns  None for 'ignore'; the lowercased text for 'string'; the result
                  of float_or_none()/parse_bytes() for 'float_none'/'bytes'; a rank
                  (higher is better) for 'order'. For any other conversion, a float
                  if the text is numeric, else the text itself
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # Values not found in the list rank like the '' entry,
            # or below every entry if the list has no '' entry
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or  value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # A non-numeric limit switches the field to string comparison
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Build the effective sort order (`self._order`) and store each field's limit.

        Specifications are applied in this order, the first occurrence of a field
        winning: forced fields, priority fields (unless params['format_sort_force']
        is set), params['format_sort'], `sort_extractor`, and finally `default`.

        @raises ExtractorError  If a specification does not match `regex`
        """
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                # "closest" is meaningless without a usable limit
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
                else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A 'combined' field expands to its sub-fields, whose limits are given
            # as a ":"-separated list; sub-fields without a limit of their own get None
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Report the user's, the extractor's and the effective sort order through `write_debug`."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        # Each visible field is printed as [+]field[(:|~)limit_text(limit)]
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn a raw field value into a tuple that sorts higher for better formats.

        @param type   Effective field type ('extractor', 'boolean', 'ordered' or 'field')
        @param value  Raw value read from the format dict
        @returns  (-10, 0) for a missing value and (1, value, 0) for non-numeric
                  values; numeric values give (0, ...) or (-1, ...) depending on
                  the field's 'reverse', 'closest' and 'limit' settings
        """
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Read the value of `field` from the format dict and compute its sort tuple."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            # The field's 'function' reduces the values of all sub-fields to a single one
            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return the sort key of `format`: one tuple per field in the effective sort order.

        NB: `format` is modified in place. Missing 'protocol', 'ext' and bitrates
        are filled in where they can be derived, and 'video_ext'/'audio_ext'
        are (re)computed.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5452
5453
5454 # XXX: Temporary
class _YDLLogger:
    """Logger that forwards every message to the wrapped `ydl` object.

    When no (or a falsy) `ydl` was given, all messages are silently dropped.
    """

    def __init__(self, ydl=None):
        self._ydl = ydl

    def debug(self, message):
        """Forward to ydl.write_debug()"""
        if not self._ydl:
            return
        self._ydl.write_debug(message)

    def info(self, message):
        """Forward to ydl.to_screen()"""
        if not self._ydl:
            return
        self._ydl.to_screen(message)

    def warning(self, message, *, once=False):
        """Forward to ydl.report_warning(); `once` is passed on positionally"""
        if not self._ydl:
            return
        self._ydl.report_warning(message, once)

    def error(self, message, *, is_error=True):
        """Forward to ydl.report_error(), passing `is_error` on as a keyword"""
        if not self._ydl:
            return
        self._ydl.report_error(message, is_error=is_error)

    def stdout(self, message):
        """Forward to ydl.to_stdout()"""
        if not self._ydl:
            return
        self._ydl.to_stdout(message)

    def stderr(self, message):
        """Forward to ydl.to_stderr()"""
        if not self._ydl:
            return
        self._ydl.to_stderr(message)