yt_dlp/utils/_utils.py

   1 import base64
   2 import binascii
   3 import calendar
   4 import codecs
   5 import collections
   6 import collections.abc
   7 import contextlib
   8 import datetime as dt
   9 import email.header
  10 import email.utils
  11 import errno
  12 import hashlib
  13 import hmac
  14 import html.entities
  15 import html.parser
  16 import inspect
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import mimetypes
  23 import netrc
  24 import operator
  25 import os
  26 import platform
  27 import random
  28 import re
  29 import shlex
  30 import socket
  31 import ssl
  32 import struct
  33 import subprocess
  34 import sys
  35 import tempfile
  36 import time
  37 import traceback
  38 import types
  39 import unicodedata
  40 import urllib.error
  41 import urllib.parse
  42 import urllib.request
  43 import xml.etree.ElementTree
  44
  45 from . import traversal
  46
  47 from ..compat import functools  # isort: split
  48 from ..compat import (
  49     compat_etree_fromstring,
  50     compat_expanduser,
  51     compat_HTMLParseError,
  52     compat_os_name,
  53 )
  54 from ..dependencies import xattr
  55
  56 __name__ = __name__.rsplit('.', 1)[0]  # noqa: A001: Pretend to be the parent module
  57
  58 # This is not clearly defined otherwise
  59 compiled_regex_type = type(re.compile(''))
  60
  61
  62 class NO_DEFAULT:
  63     pass
  64
  65
  66 def IDENTITY(x):
  67     return x
  68
  69
  70 ENGLISH_MONTH_NAMES = [
  71     'January', 'February', 'March', 'April', 'May', 'June',
  72     'July', 'August', 'September', 'October', 'November', 'December']
  73
  74 MONTH_NAMES = {
  75     'en': ENGLISH_MONTH_NAMES,
  76     'fr': [
  77         'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
  78         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
  79     # these follow the genitive grammatical case (dopełniacz)
  80     # some websites might be using nominative, which will require another month list
  81     # https://en.wikibooks.org/wiki/Polish/Noun_cases
  82     'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
  83            'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
  84 }
  85
  86 # From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
  87 TIMEZONE_NAMES = {
  88     'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
  89     'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
  90     'EST': -5, 'EDT': -4,  # Eastern
  91     'CST': -6, 'CDT': -5,  # Central
  92     'MST': -7, 'MDT': -6,  # Mountain
  93     'PST': -8, 'PDT': -7,   # Pacific
  94 }
  95
  96 # needed for sanitizing filenames in restricted mode
  97 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
  98                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
  99                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 100
 101 DATE_FORMATS = (
 102     '%d %B %Y',
 103     '%d %b %Y',
 104     '%B %d %Y',
 105     '%B %dst %Y',
 106     '%B %dnd %Y',
 107     '%B %drd %Y',
 108     '%B %dth %Y',
 109     '%b %d %Y',
 110     '%b %dst %Y',
 111     '%b %dnd %Y',
 112     '%b %drd %Y',
 113     '%b %dth %Y',
 114     '%b %dst %Y %I:%M',
 115     '%b %dnd %Y %I:%M',
 116     '%b %drd %Y %I:%M',
 117     '%b %dth %Y %I:%M',
 118     '%Y %m %d',
 119     '%Y-%m-%d',
 120     '%Y.%m.%d.',
 121     '%Y/%m/%d',
 122     '%Y/%m/%d %H:%M',
 123     '%Y/%m/%d %H:%M:%S',
 124     '%Y%m%d%H%M',
 125     '%Y%m%d%H%M%S',
 126     '%Y%m%d',
 127     '%Y-%m-%d %H:%M',
 128     '%Y-%m-%d %H:%M:%S',
 129     '%Y-%m-%d %H:%M:%S.%f',
 130     '%Y-%m-%d %H:%M:%S:%f',
 131     '%d.%m.%Y %H:%M',
 132     '%d.%m.%Y %H.%M',
 133     '%Y-%m-%dT%H:%M:%SZ',
 134     '%Y-%m-%dT%H:%M:%S.%fZ',
 135     '%Y-%m-%dT%H:%M:%S.%f0Z',
 136     '%Y-%m-%dT%H:%M:%S',
 137     '%Y-%m-%dT%H:%M:%S.%f',
 138     '%Y-%m-%dT%H:%M',
 139     '%b %d %Y at %H:%M',
 140     '%b %d %Y at %H:%M:%S',
 141     '%B %d %Y at %H:%M',
 142     '%B %d %Y at %H:%M:%S',
 143     '%H:%M %d-%b-%Y',
 144 )
 145
 146 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 147 DATE_FORMATS_DAY_FIRST.extend([
 148     '%d-%m-%Y',
 149     '%d.%m.%Y',
 150     '%d.%m.%y',
 151     '%d/%m/%Y',
 152     '%d/%m/%y',
 153     '%d/%m/%Y %H:%M:%S',
 154     '%d-%m-%Y %H:%M',
 155     '%H:%M %d/%m/%Y',
 156 ])
 157
 158 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 159 DATE_FORMATS_MONTH_FIRST.extend([
 160     '%m-%d-%Y',
 161     '%m.%d.%Y',
 162     '%m/%d/%Y',
 163     '%m/%d/%y',
 164     '%m/%d/%Y %H:%M:%S',
 165 ])
 166
 167 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 168 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
 169
 170 NUMBER_RE = r'\d+(?:\.\d+)?'
 171
 172
 173 @functools.cache
 174 def preferredencoding():
 175     """Get preferred encoding.
 176
 177     Returns the best encoding scheme for the system, based on
 178     locale.getpreferredencoding() and some further tweaks.
 179     """
 180     try:
 181         pref = locale.getpreferredencoding()
 182         'TEST'.encode(pref)
 183     except Exception:
 184         pref = 'UTF-8'
 185
 186     return pref
 187
 188
 189 def write_json_file(obj, fn):
 190     """ Encode obj as JSON and write it to fn, atomically if possible """
 191
 192     tf = tempfile.NamedTemporaryFile(
 193         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 194         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 195
 196     try:
 197         with tf:
 198             json.dump(obj, tf, ensure_ascii=False)
 199         if sys.platform == 'win32':
 200             # Need to remove existing file on Windows, else os.rename raises
 201             # WindowsError or FileExistsError.
 202             with contextlib.suppress(OSError):
 203                 os.unlink(fn)
 204         with contextlib.suppress(OSError):
 205             mask = os.umask(0)
 206             os.umask(mask)
 207             os.chmod(tf.name, 0o666 & ~mask)
 208         os.rename(tf.name, fn)
 209     except Exception:
 210         with contextlib.suppress(OSError):
 211             os.remove(tf.name)
 212         raise
 213
 214
 215 def find_xpath_attr(node, xpath, key, val=None):
 216     """ Find the xpath xpath[@key=val] """
 217     assert re.match(r'^[a-zA-Z_-]+$', key)
 218     expr = xpath + (f'[@{key}]' if val is None else f"[@{key}='{val}']")
 219     return node.find(expr)
 220
# On Python 2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
 223
 224
 225 def xpath_with_ns(path, ns_map):
 226     components = [c.split(':') for c in path.split('/')]
 227     replaced = []
 228     for c in components:
 229         if len(c) == 1:
 230             replaced.append(c[0])
 231         else:
 232             ns, tag = c
 233             replaced.append(f'{{{ns_map[ns]}}}{tag}')
 234     return '/'.join(replaced)
 235
 236
 237 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 238     def _find_xpath(xpath):
 239         return node.find(xpath)
 240
 241     if isinstance(xpath, str):
 242         n = _find_xpath(xpath)
 243     else:
 244         for xp in xpath:
 245             n = _find_xpath(xp)
 246             if n is not None:
 247                 break
 248
 249     if n is None:
 250         if default is not NO_DEFAULT:
 251             return default
 252         elif fatal:
 253             name = xpath if name is None else name
 254             raise ExtractorError(f'Could not find XML element {name}')
 255         else:
 256             return None
 257     return n
 258
 259
 260 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 261     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 262     if n is None or n == default:
 263         return n
 264     if n.text is None:
 265         if default is not NO_DEFAULT:
 266             return default
 267         elif fatal:
 268             name = xpath if name is None else name
 269             raise ExtractorError(f'Could not find XML element\'s text {name}')
 270         else:
 271             return None
 272     return n.text
 273
 274
 275 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 276     n = find_xpath_attr(node, xpath, key)
 277     if n is None:
 278         if default is not NO_DEFAULT:
 279             return default
 280         elif fatal:
 281             name = f'{xpath}[@{key}]' if name is None else name
 282             raise ExtractorError(f'Could not find XML attribute {name}')
 283         else:
 284             return None
 285     return n.attrib[key]
 286
 287
 288 def get_element_by_id(id, html, **kwargs):
 289     """Return the content of the tag with the specified ID in the passed HTML document"""
 290     return get_element_by_attribute('id', id, html, **kwargs)
 291
 292
 293 def get_element_html_by_id(id, html, **kwargs):
 294     """Return the html of the tag with the specified ID in the passed HTML document"""
 295     return get_element_html_by_attribute('id', id, html, **kwargs)
 296
 297
 298 def get_element_by_class(class_name, html):
 299     """Return the content of the first tag with the specified class in the passed HTML document"""
 300     retval = get_elements_by_class(class_name, html)
 301     return retval[0] if retval else None
 302
 303
 304 def get_element_html_by_class(class_name, html):
 305     """Return the html of the first tag with the specified class in the passed HTML document"""
 306     retval = get_elements_html_by_class(class_name, html)
 307     return retval[0] if retval else None
 308
 309
 310 def get_element_by_attribute(attribute, value, html, **kwargs):
 311     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 312     return retval[0] if retval else None
 313
 314
 315 def get_element_html_by_attribute(attribute, value, html, **kargs):
 316     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 317     return retval[0] if retval else None
 318
 319
 320 def get_elements_by_class(class_name, html, **kargs):
 321     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 322     return get_elements_by_attribute(
 323         'class', rf'[^\'"]*(?<=[\'"\s]){re.escape(class_name)}(?=[\'"\s])[^\'"]*',
 324         html, escape_value=False)
 325
 326
 327 def get_elements_html_by_class(class_name, html):
 328     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 329     return get_elements_html_by_attribute(
 330         'class', rf'[^\'"]*(?<=[\'"\s]){re.escape(class_name)}(?=[\'"\s])[^\'"]*',
 331         html, escape_value=False)
 332
 333
 334 def get_elements_by_attribute(*args, **kwargs):
 335     """Return the content of the tag with the specified attribute in the passed HTML document"""
 336     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 337
 338
 339 def get_elements_html_by_attribute(*args, **kwargs):
 340     """Return the html of the tag with the specified attribute in the passed HTML document"""
 341     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 342
 343
 344 def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
 345     """
 346     Return the text (content) and the html (whole) of the tag with the specified
 347     attribute in the passed HTML document
 348     """
 349     if not value:
 350         return
 351
 352     quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 353
 354     value = re.escape(value) if escape_value else value
 355
 356     partial_element_re = rf'''(?x)
 357         <(?P<tag>{tag})
 358          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 359          \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
 360         '''
 361
 362     for m in re.finditer(partial_element_re, html):
 363         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 364
 365         yield (
 366             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 367             whole,
 368         )
 369
 370
 371 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 372     """
 373     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 374     closing tag for the first opening tag it has encountered, and can be used
 375     as a context manager
 376     """
 377
 378     class HTMLBreakOnClosingTagException(Exception):
 379         pass
 380
 381     def __init__(self):
 382         self.tagstack = collections.deque()
 383         html.parser.HTMLParser.__init__(self)
 384
 385     def __enter__(self):
 386         return self
 387
 388     def __exit__(self, *_):
 389         self.close()
 390
 391     def close(self):
 392         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 393         # so data remains buffered; we no longer have any interest in it, thus
 394         # override this method to discard it
 395         pass
 396
 397     def handle_starttag(self, tag, _):
 398         self.tagstack.append(tag)
 399
 400     def handle_endtag(self, tag):
 401         if not self.tagstack:
 402             raise compat_HTMLParseError('no tags in the stack')
 403         while self.tagstack:
 404             inner_tag = self.tagstack.pop()
 405             if inner_tag == tag:
 406                 break
 407         else:
 408             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 409         if not self.tagstack:
 410             raise self.HTMLBreakOnClosingTagException
 411
 412
 413 # XXX: This should be far less strict
 414 def get_element_text_and_html_by_tag(tag, html):
 415     """
 416     For the first element with the specified tag in the passed HTML document
 417     return its' content (text) and the whole element (html)
 418     """
 419     def find_or_raise(haystack, needle, exc):
 420         try:
 421             return haystack.index(needle)
 422         except ValueError:
 423             raise exc
 424     closing_tag = f'</{tag}>'
 425     whole_start = find_or_raise(
 426         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 427     content_start = find_or_raise(
 428         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 429     content_start += whole_start + 1
 430     with HTMLBreakOnClosingTagParser() as parser:
 431         parser.feed(html[whole_start:content_start])
 432         if not parser.tagstack or parser.tagstack[0] != tag:
 433             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 434         offset = content_start
 435         while offset < len(html):
 436             next_closing_tag_start = find_or_raise(
 437                 html[offset:], closing_tag,
 438                 compat_HTMLParseError(f'closing {tag} tag not found'))
 439             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 440             try:
 441                 parser.feed(html[offset:offset + next_closing_tag_end])
 442                 offset += next_closing_tag_end
 443             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 444                 return html[content_start:offset + next_closing_tag_start], \
 445                     html[whole_start:offset + next_closing_tag_end]
 446         raise compat_HTMLParseError('unexpected end of html')
 447
 448
 449 class HTMLAttributeParser(html.parser.HTMLParser):
 450     """Trivial HTML parser to gather the attributes for a single element"""
 451
 452     def __init__(self):
 453         self.attrs = {}
 454         html.parser.HTMLParser.__init__(self)
 455
 456     def handle_starttag(self, tag, attrs):
 457         self.attrs = dict(attrs)
 458         raise compat_HTMLParseError('done')
 459
 460
 461 class HTMLListAttrsParser(html.parser.HTMLParser):
 462     """HTML parser to gather the attributes for the elements of a list"""
 463
 464     def __init__(self):
 465         html.parser.HTMLParser.__init__(self)
 466         self.items = []
 467         self._level = 0
 468
 469     def handle_starttag(self, tag, attrs):
 470         if tag == 'li' and self._level == 0:
 471             self.items.append(dict(attrs))
 472         self._level += 1
 473
 474     def handle_endtag(self, tag):
 475         self._level -= 1
 476
 477
 478 def extract_attributes(html_element):
 479     """Given a string for an HTML element such as
 480     <el
 481          a="foo" B="bar" c="&98;az" d=boz
 482          empty= noval entity="&amp;"
 483          sq='"' dq="'"
 484     >
 485     Decode and return a dictionary of attributes.
 486     {
 487         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 488         'empty': '', 'noval': None, 'entity': '&',
 489         'sq': '"', 'dq': '\''
 490     }.
 491     """
 492     parser = HTMLAttributeParser()
 493     with contextlib.suppress(compat_HTMLParseError):
 494         parser.feed(html_element)
 495         parser.close()
 496     return parser.attrs
 497
 498
 499 def parse_list(webpage):
 500     """Given a string for an series of HTML <li> elements,
 501     return a dictionary of their attributes"""
 502     parser = HTMLListAttrsParser()
 503     parser.feed(webpage)
 504     parser.close()
 505     return parser.items
 506
 507
 508 def clean_html(html):
 509     """Clean an HTML snippet into a readable string"""
 510
 511     if html is None:  # Convenience for sanitizing descriptions etc.
 512         return html
 513
 514     html = re.sub(r'\s+', ' ', html)
 515     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 516     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 517     # Strip html tags
 518     html = re.sub('<.*?>', '', html)
 519     # Replace html entities
 520     html = unescapeHTML(html)
 521     return html.strip()
 522
 523
 524 class LenientJSONDecoder(json.JSONDecoder):
 525     # TODO: Write tests
 526     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 527         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 528         self._close_attempts = 2 * close_objects
 529         super().__init__(*args, **kwargs)
 530
 531     @staticmethod
 532     def _close_object(err):
 533         doc = err.doc[:err.pos]
 534         # We need to add comma first to get the correct error message
 535         if err.msg.startswith('Expecting \',\''):
 536             return doc + ','
 537         elif not doc.endswith(','):
 538             return
 539
 540         if err.msg.startswith('Expecting property name'):
 541             return doc[:-1] + '}'
 542         elif err.msg.startswith('Expecting value'):
 543             return doc[:-1] + ']'
 544
 545     def decode(self, s):
 546         if self.transform_source:
 547             s = self.transform_source(s)
 548         for attempt in range(self._close_attempts + 1):
 549             try:
 550                 if self.ignore_extra:
 551                     return self.raw_decode(s.lstrip())[0]
 552                 return super().decode(s)
 553             except json.JSONDecodeError as e:
 554                 if e.pos is None:
 555                     raise
 556                 elif attempt < self._close_attempts:
 557                     s = self._close_object(e)
 558                     if s is not None:
 559                         continue
 560                 raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
 561         assert False, 'Too many attempts to decode JSON'
 562
 563
 564 def sanitize_open(filename, open_mode):
 565     """Try to open the given filename, and slightly tweak it if this fails.
 566
 567     Attempts to open the given filename. If this fails, it tries to change
 568     the filename slightly, step by step, until it's either able to open it
 569     or it fails and raises a final exception, like the standard open()
 570     function.
 571
 572     It returns the tuple (stream, definitive_file_name).
 573     """
 574     if filename == '-':
 575         if sys.platform == 'win32':
 576             import msvcrt
 577
 578             # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
 579             with contextlib.suppress(io.UnsupportedOperation):
 580                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 581         return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 582
 583     for attempt in range(2):
 584         try:
 585             try:
 586                 if sys.platform == 'win32':
 587                     # FIXME: An exclusive lock also locks the file from being read.
 588                     # Since windows locks are mandatory, don't lock the file on windows (for now).
 589                     # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
 590                     raise LockingUnsupportedError
 591                 stream = locked_file(filename, open_mode, block=False).__enter__()
 592             except OSError:
 593                 stream = open(filename, open_mode)
 594             return stream, filename
 595         except OSError as err:
 596             if attempt or err.errno in (errno.EACCES,):
 597                 raise
 598             old_filename, filename = filename, sanitize_path(filename)
 599             if old_filename == filename:
 600                 raise
 601
 602
 603 def timeconvert(timestr):
 604     """Convert RFC 2822 defined time string into system timestamp"""
 605     timestamp = None
 606     timetuple = email.utils.parsedate_tz(timestr)
 607     if timetuple is not None:
 608         timestamp = email.utils.mktime_tz(timetuple)
 609     return timestamp
 610
 611
 612 def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
 613     """Sanitizes a string so it could be used as part of a filename.
 614     @param restricted   Use a stricter subset of allowed characters
 615     @param is_id        Whether this is an ID that should be kept unchanged if possible.
 616                         If unset, yt-dlp's new sanitization rules are in effect
 617     """
 618     if s == '':
 619         return ''
 620
 621     def replace_insane(char):
 622         if restricted and char in ACCENT_CHARS:
 623             return ACCENT_CHARS[char]
 624         elif not restricted and char == '\n':
 625             return '\0 '
 626         elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
 627             # Replace with their full-width unicode counterparts
 628             return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
 629         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 630             return ''
 631         elif char == '"':
 632             return '' if restricted else '\''
 633         elif char == ':':
 634             return '\0_\0-' if restricted else '\0 \0-'
 635         elif char in '\\/|*<>':
 636             return '\0_'
 637         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
 638             return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
 639         return char
 640
 641     # Replace look-alike Unicode glyphs
 642     if restricted and (is_id is NO_DEFAULT or not is_id):
 643         s = unicodedata.normalize('NFKC', s)
 644     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
 645     result = ''.join(map(replace_insane, s))
 646     if is_id is NO_DEFAULT:
 647         result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
 648         STRIP_RE = r'(?:\0.|[ _-])*'
 649         result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
 650     result = result.replace('\0', '') or '_'
 651
 652     if not is_id:
 653         while '__' in result:
 654             result = result.replace('__', '_')
 655         result = result.strip('_')
 656         # Common case of "Foreign band name - English song title"
 657         if restricted and result.startswith('-_'):
 658             result = result[2:]
 659         if result.startswith('-'):
 660             result = '_' + result[len('-'):]
 661         result = result.lstrip('.')
 662         if not result:
 663             result = '_'
 664     return result
 665
 666
 667 def sanitize_path(s, force=False):
 668     """Sanitizes and normalizes path on Windows"""
 669     # XXX: this handles drive relative paths (c:sth) incorrectly
 670     if sys.platform == 'win32':
 671         force = False
 672         drive_or_unc, _ = os.path.splitdrive(s)
 673     elif force:
 674         drive_or_unc = ''
 675     else:
 676         return s
 677
 678     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 679     if drive_or_unc:
 680         norm_path.pop(0)
 681     sanitized_path = [
 682         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 683         for path_part in norm_path]
 684     if drive_or_unc:
 685         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 686     elif force and s and s[0] == os.path.sep:
 687         sanitized_path.insert(0, os.path.sep)
 688     # TODO: Fix behavioral differences <3.12
 689     # The workaround using `normpath` only superficially passes tests
 690     # Ref: https://github.com/python/cpython/pull/100351
 691     return os.path.normpath(os.path.join(*sanitized_path))
 692
 693
 694 def sanitize_url(url, *, scheme='http'):
 695     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 696     # the number of unwanted failures due to missing protocol
 697     if url is None:
 698         return
 699     elif url.startswith('//'):
 700         return f'{scheme}:{url}'
 701     # Fix some common typos seen so far
 702     COMMON_TYPOS = (
 703         # https://github.com/ytdl-org/youtube-dl/issues/15649
 704         (r'^httpss://', r'https://'),
 705         # https://bx1.be/lives/direct-tv/
 706         (r'^rmtp([es]?)://', r'rtmp\1://'),
 707     )
 708     for mistake, fixup in COMMON_TYPOS:
 709         if re.match(mistake, url):
 710             return re.sub(mistake, fixup, url)
 711     return url
 712
 713
 714 def extract_basic_auth(url):
 715     parts = urllib.parse.urlsplit(url)
 716     if parts.username is None:
 717         return url, None
 718     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 719         parts.hostname if parts.port is None
 720         else f'{parts.hostname}:{parts.port}')))
 721     auth_payload = base64.b64encode(
 722         ('{}:{}'.format(parts.username, parts.password or '')).encode())
 723     return url, f'Basic {auth_payload.decode()}'
 724
 725
 726 def expand_path(s):
 727     """Expand shell variables and ~"""
 728     return os.path.expandvars(compat_expanduser(s))
 729
 730
 731 def orderedSet(iterable, *, lazy=False):
 732     """Remove all duplicates from the input iterable"""
 733     def _iter():
 734         seen = []  # Do not use set since the items can be unhashable
 735         for x in iterable:
 736             if x not in seen:
 737                 seen.append(x)
 738                 yield x
 739
 740     return _iter() if lazy else list(_iter())
 741
 742
 743 def _htmlentity_transform(entity_with_semicolon):
 744     """Transforms an HTML entity to a character."""
 745     entity = entity_with_semicolon[:-1]
 746
 747     # Known non-numeric HTML entity
 748     if entity in html.entities.name2codepoint:
 749         return chr(html.entities.name2codepoint[entity])
 750
 751     # TODO: HTML5 allows entities without a semicolon.
 752     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 753     if entity_with_semicolon in html.entities.html5:
 754         return html.entities.html5[entity_with_semicolon]
 755
 756     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 757     if mobj is not None:
 758         numstr = mobj.group(1)
 759         if numstr.startswith('x'):
 760             base = 16
 761             numstr = f'0{numstr}'
 762         else:
 763             base = 10
 764         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 765         with contextlib.suppress(ValueError):
 766             return chr(int(numstr, base))
 767
 768     # Unknown entity in name, return its literal representation
 769     return f'&{entity};'
 770
 771
 772 def unescapeHTML(s):
 773     if s is None:
 774         return None
 775     assert isinstance(s, str)
 776
 777     return re.sub(
 778         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 779
 780
 781 def escapeHTML(text):
 782     return (
 783         text
 784         .replace('&', '&amp;')
 785         .replace('<', '&lt;')
 786         .replace('>', '&gt;')
 787         .replace('"', '&quot;')
 788         .replace("'", '&#39;')
 789     )
 790
 791
 792 class netrc_from_content(netrc.netrc):
 793     def __init__(self, content):
 794         self.hosts, self.macros = {}, {}
 795         with io.StringIO(content) as stream:
 796             self._parse('-', stream, False)
 797
 798
 799 class Popen(subprocess.Popen):
 800     if sys.platform == 'win32':
 801         _startupinfo = subprocess.STARTUPINFO()
 802         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 803     else:
 804         _startupinfo = None
 805
 806     @staticmethod
 807     def _fix_pyinstaller_ld_path(env):
 808         """Restore LD_LIBRARY_PATH when using PyInstaller
 809             Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
 810                  https://github.com/yt-dlp/yt-dlp/issues/4573
 811         """
 812         if not hasattr(sys, '_MEIPASS'):
 813             return
 814
 815         def _fix(key):
 816             orig = env.get(f'{key}_ORIG')
 817             if orig is None:
 818                 env.pop(key, None)
 819             else:
 820                 env[key] = orig
 821
 822         _fix('LD_LIBRARY_PATH')  # Linux
 823         _fix('DYLD_LIBRARY_PATH')  # macOS
 824
 825     def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
 826         if env is None:
 827             env = os.environ.copy()
 828         self._fix_pyinstaller_ld_path(env)
 829
 830         self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
 831         if text is True:
 832             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 833             kwargs.setdefault('encoding', 'utf-8')
 834             kwargs.setdefault('errors', 'replace')
 835
 836         if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
 837             if not isinstance(args, str):
 838                 args = shell_quote(args, shell=True)
 839             shell = False
 840             # Set variable for `cmd.exe` newline escaping (see `utils.shell_quote`)
 841             env['='] = '"^\n\n"'
 842             args = f'{self.__comspec()} /Q /S /D /V:OFF /E:ON /C "{args}"'
 843
 844         super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)
 845
 846     def __comspec(self):
 847         comspec = os.environ.get('ComSpec') or os.path.join(
 848             os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
 849         if os.path.isabs(comspec):
 850             return comspec
 851         raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')
 852
 853     def communicate_or_kill(self, *args, **kwargs):
 854         try:
 855             return self.communicate(*args, **kwargs)
 856         except BaseException:  # Including KeyboardInterrupt
 857             self.kill(timeout=None)
 858             raise
 859
 860     def kill(self, *, timeout=0):
 861         super().kill()
 862         if timeout != 0:
 863             self.wait(timeout=timeout)
 864
 865     @classmethod
 866     def run(cls, *args, timeout=None, **kwargs):
 867         with cls(*args, **kwargs) as proc:
 868             default = '' if proc.__text_mode else b''
 869             stdout, stderr = proc.communicate_or_kill(timeout=timeout)
 870             return stdout or default, stderr or default, proc.returncode
 871
 872
 873 def encodeArgument(s):
 874     # Legacy code that uses byte strings
 875     # Uncomment the following line after fixing all post processors
 876     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 877     return s if isinstance(s, str) else s.decode('ascii')
 878
 879
 880 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 881
 882
 883 def timetuple_from_msec(msec):
 884     secs, msec = divmod(msec, 1000)
 885     mins, secs = divmod(secs, 60)
 886     hrs, mins = divmod(mins, 60)
 887     return _timetuple(hrs, mins, secs, msec)
 888
 889
 890 def formatSeconds(secs, delim=':', msec=False):
 891     time = timetuple_from_msec(secs * 1000)
 892     if time.hours:
 893         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 894     elif time.minutes:
 895         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 896     else:
 897         ret = '%d' % time.seconds
 898     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 899
 900
 901 def bug_reports_message(before=';'):
 902     from ..update import REPOSITORY
 903
 904     msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
 905            'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')
 906
 907     before = before.rstrip()
 908     if not before or before.endswith(('.', '!', '?')):
 909         msg = msg[0].title() + msg[1:]
 910
 911     return (before + ' ' if before else '') + msg
 912
 913
 914 class YoutubeDLError(Exception):
 915     """Base exception for YoutubeDL errors."""
 916     msg = None
 917
 918     def __init__(self, msg=None):
 919         if msg is not None:
 920             self.msg = msg
 921         elif self.msg is None:
 922             self.msg = type(self).__name__
 923         super().__init__(self.msg)
 924
 925
class ExtractorError(YoutubeDLError):
    """Error during info extraction.

    The final message is assembled from `ie`, `video_id`, `orig_msg` and `cause`
    and is rebuilt whenever one of the attributes is reassigned (see `__setattr__`).
    """

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.

        @param msg       The bare error message (stored as `orig_msg`)
        @param cause     Stored as-is; included in the message as "(caused by ...)"
        @param video_id  Stored as-is; prefixes the message when set
        @param ie        Stored as-is; prefixes the message in square brackets when set
        """
        from ..networking.exceptions import network_exceptions
        # An error created while a network exception is being handled is always "expected"
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When re-wrapping another ExtractorError, keep the exc_info that one recorded
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: "[ie] video_id: orig_msg (caused by ...)" followed by
        # the bug report text unless the error is expected
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted `traceback` and/or `cause` joined by a newline, or None if there is neither"""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            # [1:] drops the first line produced by format_exception
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once `msg` is set (truthy), any other attribute assignment refreshes `msg` and `args`.
        # 'msg' and 'args' themselves are excluded so the assignments below do not recurse
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
 968
 969
 970 class UnsupportedError(ExtractorError):
 971     def __init__(self, url):
 972         super().__init__(
 973             f'Unsupported URL: {url}', expected=True)
 974         self.url = url
 975
 976
 977 class RegexNotFoundError(ExtractorError):
 978     """Error when a regex didn't match"""
 979     pass
 980
 981
 982 class GeoRestrictedError(ExtractorError):
 983     """Geographic restriction Error exception.
 984
 985     This exception may be thrown when a video is not available from your
 986     geographic location due to geographic restrictions imposed by a website.
 987     """
 988
 989     def __init__(self, msg, countries=None, **kwargs):
 990         kwargs['expected'] = True
 991         super().__init__(msg, **kwargs)
 992         self.countries = countries
 993
 994
 995 class UserNotLive(ExtractorError):
 996     """Error when a channel/user is not live"""
 997
 998     def __init__(self, msg=None, **kwargs):
 999         kwargs['expected'] = True
1000         super().__init__(msg or 'The channel is not currently live', **kwargs)
1001
1002
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        # Stored as given; None when the caller supplied no exception information
        self.exc_info = exc_info
1015
1016
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Message used when the exception is created without an explicit one
    msg = 'Entry not found in info'
1024
1025
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """`filename`, if given, is appended to the message"""
        if filename is not None:
            # Assigns on the instance; the class-level default stays untouched
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
1038
1039
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    It adds nothing to the base exception class.
    """
1046
1047
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Message used when the exception is created without an explicit one
    msg = 'The download was cancelled'
1051
1052
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Replaces the generic cancellation message of the parent class
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1056
1057
class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    # Replaces the generic cancellation message of the parent class
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1061
1062
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Replaces the generic cancellation message of the parent class
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1066
1067
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """`expected` is stored as-is on the exception"""
        super().__init__(msg)
        self.expected = expected
1074
1075
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Takes no arguments: always uses the fixed message above with expected=False
        super().__init__(self.msg, expected=False)
1082
1083
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """`err`, if given, is appended to the message"""
        if err is not None:
            # Assigns on the instance; the class-level default stays untouched
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1096
1097
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
1111
1112
class XAttrMetadataError(YoutubeDLError):
    """Error while writing extended attribute metadata

    `reason` classifies the failure as one of
    'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        """
        @param code     errno value of the failure, if known
        @param msg      Error description; None is treated like the default
        """
        # An explicit None would otherwise make the substring checks below raise TypeError
        if msg is None:
            msg = 'Unknown error'
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1127
1128
class XAttrUnavailableError(YoutubeDLError):
    """Plain subclass of the base error; adds no behaviour of its own"""
1131
1132
def is_path_like(f):
    """Whether `f` is a `str`, `bytes` or `os.PathLike` object"""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1135
1136
def extract_timezone(date_str, default=None):
    """Split a trailing timezone off `date_str`

    A numeric offset (`+hh[:]mm`, `-hh[:]mm`) or `Z` is tried first, then a name
    from `TIMEZONE_NAMES` following a time of day.

    @param default  Offset to use when none was found. Any falsy value means UTC;
                    pass NO_DEFAULT to get None instead
    @returns        (offset as `timedelta` or None, `date_str` without the timezone)
    """
    offset_match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    timezone = None

    if offset_match:
        date_str = date_str[:-len(offset_match.group('tz'))]
        # A bare "Z" has no sign group, so the offset stays unset here
        if offset_match.group('sign'):
            direction = 1 if offset_match.group('sign') == '+' else -1
            timezone = dt.timedelta(
                hours=direction * int(offset_match.group('hours')),
                minutes=direction * int(offset_match.group('minutes')))
    else:
        name_match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(name_match and name_match.group('tz').strip())
        # Names that are not in the table are left in the string
        if timezone is not None:
            date_str = date_str[:-len(name_match.group('tz'))]
            timezone = dt.timedelta(hours=timezone)

    if timezone is None and default is not NO_DEFAULT:
        timezone = default or dt.timedelta()

    return timezone, date_str
1169
1170
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param delimiter    Separator between the date and the time of day
    @param timezone     Default offset passed to `extract_timezone`
    @returns            The timestamp, or None if `date_str` is None or could not be parsed
    """
    if date_str is None:
        return None

    # Fractional seconds are discarded
    without_fraction = re.sub(r'\.[0-9]+', '', date_str)
    timezone, naive_str = extract_timezone(without_fraction, timezone)

    with contextlib.suppress(ValueError, TypeError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        utc_time = dt.datetime.strptime(naive_str, date_format) - timezone
        return calendar.timegm(utc_time.timetuple())
1185
1186
def date_formats(day_first=True):
    """Return `DATE_FORMATS_DAY_FIRST` if `day_first` is true, else `DATE_FORMATS_MONTH_FIRST`"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1189
1190
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None if `date_str` is None or matches none of the known formats
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    upload_date = None
    # Every format is tried; if several match, the last one wins
    for date_format in date_formats(day_first):
        try:
            upload_date = dt.datetime.strptime(cleaned, date_format).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the email (RFC 2822 style) date parser
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            with contextlib.suppress(ValueError):
                upload_date = dt.datetime(*parsed[:6]).strftime('%Y%m%d')

    return None if upload_date is None else str(upload_date)
1213
1214
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string

    @param day_first    Selects the list of formats to try (see `date_formats`)
    @returns            The timestamp, or None if `date_str` is not a `str`
                        or could not be parsed
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas, pipes and weekday names, then collapse runs of whitespace.
    # NB: The weekday pattern has no alternative for Sunday
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # NOTE(review): 12 hours are added whenever "PM" occurs anywhere in the string, regardless
    # of the hour; presumably "12:xx PM" would end up on the next day - confirm against the formats
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # The first format that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta)
            return calendar.timegm(dt_.timetuple())

    # Fall back to the email date parser. This path returns a float since `total_seconds()` is a float
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1246
1247
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL

    Only the part before the first "?" is considered.
    Returns `default_ext` if no plausible extension is found.
    """
    if url is None or '.' not in url:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1259
1260
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by `<sub_lang>.<sub_format>` (via `replace_extension`)"""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1263
1264
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Parse an absolute or relative date expression into a datetime object.
    Accepted syntax:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(dt.datetime.now(dt.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - dt.timedelta(days=1)

    relative = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if relative is None:
        # Not a relative expression: must be a plain DATE
        return datetime_round(dt.datetime.strptime(date_str, format), precision)

    # The part before the offset is itself parsed recursively
    base = datetime_from_str(relative.group('start'), precision, format)
    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')

    if unit in ('month', 'year'):
        # Calendar arithmetic; such results are rounded to days when precision is auto
        shifted = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + dt.timedelta(**{unit + 's': amount})

    return datetime_round(shifted, unit) if auto_precision else shifted
1305
1306
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Parse a date expression with datetime_from_str and return only its date part

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
                   Anything else raises ValueError
    """
    if strict:
        strict_re = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(strict_re, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1317
1318
def datetime_add_months(dt_, months):
    """Shift a datetime object by a (possibly negative) number of months.

    The day is clamped to the last day of the resulting month.
    """
    year_shift, month_index = divmod(dt_.month + months - 1, 12)
    target_year = dt_.year + year_shift
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt_.replace(target_year, target_month, min(dt_.day, last_day))
1326
1327
def datetime_round(dt_, precision='day'):
    """
    Round a datetime object to the nearest day, hour, minute or second

    With precision 'microsecond' the object is returned unchanged. Otherwise
    the result is a new timezone-aware (UTC) datetime; halves are rounded up.
    """
    if precision == 'microsecond':
        return dt_

    seconds_since_epoch = calendar.timegm(dt_.timetuple())
    step = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    rounded = ((seconds_since_epoch + step / 2) // step) * step
    return dt.datetime.fromtimestamp(rounded, dt.timezone.utc)
1344
1345
def hyphenate_date(date_str):
    """
    Turn a 'YYYYMMDD' date into 'YYYY-MM-DD'. Any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    year, month, day = parts.groups()
    return f'{year}-{month}-{day}'
1354
1355
class DateRange:
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str (strict mode)

        A side that is None is unbounded (earliest/latest representable date).
        Raises ValueError if start is after end.
        """
        if start is not None:
            self.start = date_from_str(start, strict=True)
        else:
            self.start = dt.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end, strict=True)
        else:
            self.end = dt.datetime.max.date()
        if self.start > self.end:
            raise ValueError(f'Date range: "{self}" , the start date must be before the end date')

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range

        `date` may be a date object, a datetime object (only its calendar date
        is considered) or a string accepted by date_from_str
        """
        if isinstance(date, dt.datetime):
            # datetime is a subclass of date, but the two cannot be ordered against each other
            date = date.date()
        elif not isinstance(date, dt.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __str__(self):
        return f'{self.start} to {self.end}'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1392
1393
@functools.cache
def system_identifier():
    """Describe the Python runtime and platform in a single line (computed once, then cached)

    Format: `Python <version> (<implementation> <machine> <bits>) - <platform> (<OpenSSL>[, <libc>])`
    """
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_ver = []
    libc_info = format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s')

    return 'Python {} ({} {} {}) - {} ({}{})'.format(
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        libc_info,
    )
1412
1413
@functools.cache
def get_windows_version():
    """ Get Windows version. returns () if it's not running on Windows """
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1421
1422
def write_string(s, out=None, encoding=None):
    """Write the string `s` to `out` (default: `sys.stderr`) and flush

    The text is written encoded if `out` is opened in binary mode or exposes
    a `buffer`; otherwise it is written as `str`.
    Characters that cannot be encoded are dropped.
    """
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
    mode = getattr(out, 'mode', None) or ''
    if 'b' in mode:
        target = out
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
    else:
        target, enc = out, None

    payload = s.encode(enc, 'ignore') if enc else s
    target.write(payload)
    out.flush()
1443
1444
# TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report the use of a deprecated feature

    In the CLI, each distinct message is shown only once, through `printer`
    if given or `write_string` otherwise (`kwargs` are passed along).
    Outside the CLI, a `DeprecationWarning` is issued instead.
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    seen = deprecation_warning._cache
    if msg in seen:
        return
    seen.add(msg)

    text = f'{msg}{bug_reports_message()}'
    if printer:
        return printer(text, **kwargs)
    return write_string(f'ERROR: {text}\n', **kwargs)


# Messages that were already shown in the CLI
deprecation_warning._cache = set()
1461
1462
def bytes_to_intlist(bs):
    """Convert a byte sequence to a list of integers

    Sequences of single characters are converted with `ord`.
    Empty input gives an empty list.
    """
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    return list(map(ord, bs))
1470
1471
def intlist_to_bytes(xs):
    """Pack a sequence of integers (each a valid unsigned byte) into `bytes`"""
    if not xs:
        return b''
    return struct.pack(f'{len(xs)}B', *xs)
1476
1477
class LockingUnsupportedError(OSError):
    """Raised by the fallback locking functions when file locking is not available"""
    msg = 'File locking is not supported'

    def __init__(self):
        # Takes no arguments: always carries the fixed message above
        super().__init__(self.msg)
1483
1484
# Cross-platform file locking
# Defines `_lock_file(f, exclusive, block)` and `_unlock_file(f)` using
# LockFileEx/UnlockFileEx on Windows and `fcntl` elsewhere. If `fcntl` cannot
# be imported, both functions raise LockingUnsupportedError
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED),  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED),  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Length (low/high DWORD) of the locked region, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the file `f`. Raises BlockingIOError if LockFileEx fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object since `_unlock_file` must pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # 0x2: LOCKFILE_EXCLUSIVE_LOCK, 0x1: LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Unlock a file locked by `_lock_file`. Raises OSError if UnlockFileEx fails"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError(f'Unlocking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock the file `f` with flock(), falling back to lockf()"""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # The lock is held by someone else; do not retry with lockf()
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Unlock the file `f`, trying each method until one succeeds"""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            # Last attempt: an error from this one is not suppressed
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError

        def _unlock_file(f):
            raise LockingUnsupportedError
1571
1572
class locked_file:
    """A file that is locked while used as a context manager

    The file is opened on construction, locked on `__enter__`/`open`
    (shared for mode 'r'/'rb', exclusive otherwise) and unlocked and closed
    on `__exit__`/`close`. Other attributes are looked up on the underlying file object.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        if not writable:
            flags = os.O_RDONLY
        elif readable:
            flags = os.O_RDWR
        else:
            flags = os.O_WRONLY

        flags |= getattr(os, 'O_CLOEXEC', 0)  # UNIX only
        flags |= getattr(os, 'O_BINARY', 0)  # Windows only
        flags |= getattr(os, 'O_NOINHERIT', 0)  # Windows only
        if writable:
            flags |= os.O_CREAT  # O_TRUNC only after locking
        if 'a' in mode:
            flags |= os.O_APPEND
        if 'x' in mode:
            flags |= os.O_EXCL

        fd = os.open(filename, flags, 0o666)
        self.f = os.fdopen(fd, mode, encoding=encoding)

    def __enter__(self):
        try:
            _lock_file(self.f, 'r' not in self.mode, self.block)
        except OSError:
            self.f.close()
            raise
        self.locked = True

        if 'w' not in self.mode:
            return self
        # Truncation is deferred until the lock is held
        try:
            self.f.truncate()
        except OSError as e:
            tolerated = (
                errno.ESPIPE,  # Illegal seek - expected for FIFO
                errno.EINVAL,  # Invalid argument - expected for /dev/null
            )
            if e.errno not in tolerated:
                raise
        return self

    def unlock(self):
        """Release the lock, if held. The file stays open"""
        if self.locked:
            try:
                _unlock_file(self.f)
            finally:
                self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1636
1637
@functools.cache
def get_filesystem_encoding():
    """Return `sys.getfilesystemencoding()`, or 'utf-8' if that is None (cached)"""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1642
1643
# Translation tables for `shell_quote` on Windows
# Without a shell: only double quotes are escaped, using a backslash
_WINDOWS_QUOTE_TRANS = str.maketrans({'"': R'\"'})
# With a shell (`cmd.exe`): quotes, newlines and percent signs are replaced
_CMD_QUOTE_TRANS = str.maketrans({
    # Keep quotes balanced by replacing them with `""` instead of `\\"`
    '"': '""',
    # These require an env-variable `=` containing `"^\n\n"` (set in `utils.Popen`)
    # `=` should be unique since variables containing `=` cannot be set using cmd
    '\n': '%=%',
    '\r': '%=%',
    # Use zero length variable replacement so `%` doesn't get expanded
    # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`)
    '%': '%%cd:~,%',
})
1656
1657
def shell_quote(args, *, shell=False):
    """Join one or more arguments into a single quoted command line string

    @param shell    On Windows, whether the result is meant for `cmd.exe`
                    (selects the translation table). Has no effect elsewhere
    """
    args = list(variadic(args))

    if compat_os_name != 'nt':
        return shlex.join(args)

    trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS

    def quote(arg):
        # Arguments made up only of these ASCII characters are passed through unquoted
        if re.fullmatch(r'[\w#$*\-+./:?@\\]+', arg, re.ASCII):
            return arg
        # Double every run of backslashes that precedes a quote or the end of the argument
        escaped = re.sub(r'(\\+)("|$)', r'\1\1\2', arg).translate(trans)
        return f'"{escaped}"'

    return ' '.join(map(quote, args))
1669
1670
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    `data` is JSON-encoded into the URL fragment. If the URL already carries
    smuggled data, both are combined, with the existing values taking precedence.
    `data` itself is not modified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a new dict so that the dict passed by the caller is left untouched
    merged = {**data, **idata}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
1679
1680
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into the plain URL and the data it carries.

    @returns  (url, data); when nothing was smuggled, the URL is returned
              unchanged together with `default`
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _, fragment = smug_url.rpartition('#')
        payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(payload)
    return smug_url, default
1688
1689
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format a number with a decimal suffix like k, M, etc.

    @param num     the number to format (anything `float_or_none` accepts)
    @param fmt     %-format string receiving (scaled number, suffix)
    @param factor  the base of the suffixes; 1024 selects binary suffixes (Ki, Mi, ...)
    @returns       the formatted string, or None if `num` is not a number or is negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to the available suffixes. The lower bound matters for values below 1:
        # their logarithm is negative, which would otherwise index the suffix list from the end
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
1702
1703
def format_bytes(bytes):
    """Format a byte count with binary suffixes (e.g. KiB), or return 'N/A' if it cannot be formatted."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
1706
1707
def lookup_unit_table(unit_table, s, strict=False):
    """Parse "<number><unit>" from `s` and scale the number by the unit's multiplier.

    @param unit_table  dict mapping unit names to multipliers
    @param s           the string to parse
    @param strict      if True, the whole string must match and only `.` is accepted
                       as decimal separator; otherwise only the start of the string
                       must match and `,` is accepted too
    @returns           the rounded result, or None if `s` does not match
    """
    if strict:
        number_pattern, matcher = NUMBER_RE, re.fullmatch
    else:
        number_pattern, matcher = NUMBER_RE.replace(R'\.', '[,.]'), re.match
    unit_pattern = '|'.join(map(re.escape, unit_table))

    mobj = matcher(rf'(?P<num>{number_pattern})\s*(?P<unit>{unit_pattern})\b', s)
    if not mobj:
        return None

    value = float(mobj.group('num').replace(',', '.'))
    multiplier = unit_table[mobj.group('unit')]
    return round(value * multiplier)
1719
1720
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    # Single-letter prefixes, each a power of 1024; '' stands for plain bytes
    multipliers = {}
    for power, prefix in enumerate(['', *'KMGTPEZY']):
        multipliers[prefix] = 1024 ** power
    return lookup_unit_table(multipliers, s.upper(), strict=True)
1726
1727
def parse_filesize(s):
    """Parse a human-readable file size such as "5 MiB" or "1.2 GB" into a number of bytes.

    @returns  the rounded size in bytes, or None if `s` is None or cannot be parsed
    """
    if s is None:
        return None

    # (prefix letter, decimal unit name, binary unit name); the position gives the exponent
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        unit_table.update({
            f'{letter}iB': binary,
            f'{letter}B': decimal,
            f'{lower}B': binary,
            f'{letter}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
1797
1798
def parse_count(s):
    """Parse a count such as "1,234", "1.5k" or "2M views" into an integer.

    @returns  the parsed count, or None if `s` is None or nothing could be parsed
    """
    if s is None:
        return None

    # Drop a leading non-numeric word
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    # A plain number, possibly with separators
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, s)
    if scaled is not None:
        return scaled

    # Fall back to a leading number followed by whitespace or the end of the string
    leading = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(leading.group(1)) if leading else None
1826
1827
def parse_resolution(s, *, lenient=False):
    """Extract a resolution from a string.

    Recognizes "<width>x<height>", "<height>p"/"<height>i" and "4k"/"8k", in that order.

    @param lenient  if True, "<width>x<height>" may be directly adjacent to letters or digits
    @returns        dict with 'width' and/or 'height'; empty if nothing was recognized
    """
    if s is None:
        return {}

    dimensions_re = (
        r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
        else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    dimensions = re.search(dimensions_re, s)
    if dimensions:
        width, height = map(int, dimensions.group('w', 'h'))
        return {'width': width, 'height': height}

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        return {'height': int(kilo.group(1)) * 540}

    return {}
1851
1852
def parse_bitrate(s):
    """Return the integer preceding "kbps" in `s`, or None if `s` is not a string or has none."""
    if isinstance(s, str):
        found = re.search(r'\b(\d+)\s*kbps', s)
        if found:
            return int(found.group(1))
    return None
1859
1860
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of a month by its full name in the given language,
        or None if the name is unknown. Unknown languages fall back to English """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    return next(
        (number for number, month in enumerate(month_names, 1) if month == name),
        None)
1870
1871
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month by its three-letter English abbreviation
        (locale-independently), or None if the abbreviation is unknown """

    return next(
        (number for number, month in enumerate(ENGLISH_MONTH_NAMES, 1) if month[:3] == abbrev),
        None)
1880
1881
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML.

    The predefined entities (amp, lt, gt, apos, quot) and numeric character
    references are left untouched.
    """
    # XML characters go up to U+10FFFF, i.e. up to 6 hexadecimal or 7 decimal digits;
    # a shorter limit would mangle valid references such as `&#x1F600;`
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,6};|#[0-9]{,7};)',
        '&amp;',
        xml_str)
1888
1889
def setproctitle(title):
    """Set the name of the current process via prctl(PR_SET_NAME), on a best-effort basis.

    Does nothing if ctypes or `libc.so.6` is unavailable, or if libc has no `prctl`.

    @param title  the new process name (str)
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # Kept from older interpreters where LoadLibrary rejected the name's type
        return
    # Passing the bytes (rather than a size) makes the buffer one item larger than
    # the title so that it is NUL-terminated, as prctl expects a C string
    buf = ctypes.create_string_buffer(title.encode())
    try:
        # PR_SET_NAME = 15      Ref: /usr/include/linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1916
1917
def remove_start(s, start):
    """Return `s` without the prefix `start`; `s` is returned as is if it is None or lacks the prefix."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1920
1921
def remove_end(s, end):
    """Return `s` without the suffix `end`; `s` is returned as is if it is None or lacks the suffix."""
    # An empty suffix must be special-cased: `s[:-0]` is `s[:0]`, i.e. an empty string
    if s is None or not s.endswith(end) or not end:
        return s
    return s[:-len(end)]
1924
1925
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding `s`, if present."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1933
1934
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"

    Returns the network location of the URL without a leading "www.",
    or None if that is empty
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1941
1942
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and trailing slashes."""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1946
1947
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any query or fragment.

    Raises AttributeError if `url` does not match.
    """
    mobj = re.match(r'https?://[^?#]+/', url)
    return mobj.group(0)
1950
1951
def urljoin(base, path):
    """Join `path` onto `base`, tolerating bytes and invalid input.

    @param base  http(s) or protocol-relative base URL (str or bytes)
    @param path  the URL or path to join (str or bytes)
    @returns     `path` itself if it is already absolute or protocol-relative,
                 the joined URL if `base` is usable, and None otherwise
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    # `-` is placed last in the character class so that it is taken literally;
    # written as `+-.` it forms a range that also admits `,` in the scheme
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
1965
1966
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to an int, returning `default` if that is not possible.

    @param scale     the result is floor-divided by this
    @param default   returned when the conversion fails
    @param get_attr  if given, the value is read from this attribute of `v`
    @param invscale  the result is multiplied by this (before dividing by `scale`)
    """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    try:
        converted = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        converted = default
    return converted
1974
1975
def str_or_none(v, default=None):
    """Return `str(v)`, or `default` if `v` is None."""
    if v is None:
        return default
    return str(v)
1978
1979
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Integers are returned unchanged; in strings, the characters `,`, `.` and `+`
    are dropped before conversion. Any other type gives None """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
1987
1988
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to a float, returning `default` if it is None or cannot be converted.

    The result is multiplied by `invscale` and divided by `scale`.
    """
    if v is None:
        return default
    try:
        converted = float(v) * invscale / scale
    except (ValueError, TypeError):
        converted = default
    return converted
1996
1997
def bool_or_none(v, default=None):
    """Return `v` if it is a bool, otherwise `default`."""
    if isinstance(v, bool):
        return v
    return default
2000
2001
def strip_or_none(v, default=None):
    """Return `v` stripped of surrounding whitespace if it is a str, otherwise `default`."""
    if isinstance(v, str):
        return v.strip()
    return default
2004
2005
def url_or_none(url):
    """Return the stripped `url` if it looks like a URL with a supported scheme, else None.

    Protocol-relative URLs (starting with `//`) are accepted too.
    """
    if not url or not isinstance(url, str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2011
2012
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a timestamp or a YYYYMMDD date string, returning `default` on failure.

    @param timestamp    unix timestamp (int/float) or a date string in YYYYMMDD form
    @param date_format  strftime format; `%s` (seconds since the epoch) is supported
                        on all platforms
    @param default      returned if `timestamp` is of another type, is out of range
                        or cannot be parsed/formatted
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, dt.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            datetime_object = (dt.datetime.fromtimestamp(0, dt.timezone.utc)
                               + dt.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = dt.datetime.strptime(timestamp, '%Y%m%d')
        # For any other type, `datetime_object` stays None and the AttributeError
        # raised below leads to `default`
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError):
        # OverflowError is raised for timestamps beyond the range of timedelta/datetime
        # (e.g. infinity, or values that are actually in milliseconds)
        return default
2030
2031
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in order: colon-separated ("1:02:03.5"),
    unit-suffixed including ISO 8601 style ("PT1H2M3S", "3 hours 11 minutes"),
    and a single fractional hour or minute amount ("2.5 hours", "87 Min.").

    @returns  the duration in seconds, or None if `s` is not a string or is not understood
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None
    clock = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if clock:
        days, hours, mins, secs, ms = clock.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # Years, months and weeks are recognized but do not contribute to the result
        worded = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if worded:
            days, hours, mins, secs, ms = worded.groups()
        else:
            fractional = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not fractional:
                return None
            hours, mins = fractional.groups()

    if ms:
        # The fractional part may have been written with a colon
        ms = ms.replace(':', '.')
    weighted_parts = zip((days, hours, mins, secs, ms), (86400, 3600, 60, 1, 1))
    return sum(float(part or 0) * seconds_per_unit for part, seconds_per_unit in weighted_parts)
2086
2087
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the real extension of `filename`.

    If `expected_real_ext` is given and differs from the real extension,
    `ext` is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
2094
2095
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of `filename` with `ext`.

    If `expected_real_ext` is given and differs from the real extension,
    `ext` is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}'
2099
2100
def check_executable(exe, args=[]):
    """ Checks whether the given binary can be run, and returns its name (False otherwise).
    args can be a list of arguments for a short output (like -version) """
    command = [exe, *args]
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
2109
2110
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout/stderr output.

    @returns  the output, None if the process exited with a non-zero code,
              or False if it could not be started
    """
    command = [encodeArgument(exe), *args]
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        output, _, returncode = Popen.run(
            command, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else output
2123
2124
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    @param output        the output to search (str)
    @param version_re    regex whose first group is the version; by default,
                         the token following "version" is taken
    @param unrecognized  returned when no version is found
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2134
2135
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    unrecognized may hold one or two values: the first is returned when the output
    contains no recognizable version, the last when the executable exits with an error """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return unrecognized[-1]
    if not output:
        return output
    return detect_exe_version(output, version_re, unrecognized[0])
2146
2147
def frange(start=0, stop=None, step=1):
    """Float range

    Like `range`, but the arguments may be floats. With a single argument,
    it is taken as `stop`. A zero `step` yields nothing."""
    if stop is None:
        start, stop = 0, start

    if not step:
        sign = 0
    elif step > 0:
        sign = 1
    else:
        sign = -1

    current = start
    # Multiplying by the sign lets one comparison serve both directions
    while sign * current < sign * stop:
        yield current
        current += step
2156
2157
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the iterable only when needed and are kept in a cache,
    so the list can be indexed and iterated over repeatedly"""

    class IndexError(IndexError):  # noqa: A001
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # The not yet consumed part of the source
        self._iterable = iter(iterable)
        # The items consumed so far, always in source order
        self._cache = [] if _cache is None else _cache
        # Whether this list presents the items in reverse order
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # First replay what is already cached, then continue consuming the source
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        """Consume the rest of the iterable and return the cache (in source order)"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        """Map an index of the reversed list to the matching index of the cache (0 -> -1, -1 -> 0, ...)"""
        return None if x is None else ~x

    def __getitem__(self, idx):
        # Normalize the request into start/stop/step in terms of the cache;
        # an integer index is treated as start == stop with a step of 0
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of further items needed for the largest requested index to be cached
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Fetching one item is enough to tell whether the list is empty
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # The length is only known once everything has been consumed
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # The new object shares both the iterable and the cache with this one
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        # The copy shares both the iterable and the cache with this one
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2245
2246
class PagedList:
    """Base class for lists whose items are fetched page by page through `pagefunc`.

    Subclasses must implement `_getslice(start, end)`.
    """

    class IndexError(IndexError):  # noqa: A001
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __bool__(self):
        first_item = self.getslice(0, 1)
        return bool(first_item)

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not (isinstance(idx, int) and idx >= 0):
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError

    def getpage(self, pagenum):
        """Return the items of page `pagenum` as a list, using the cache if enabled"""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            if pagenum > self._pagecount:
                # Pages beyond the known page count are empty
                page_results = []
            else:
                page_results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')
2288
2289
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # Yield the items in [start, end), fetching pages one by one
        # beginning with the page that contains `start`
        for pagenum in itertools.count(start // self._pagesize):
            # Indices of the first item of this page and of the next page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset within the page at which to begin (non-zero only on the page containing `start`)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset within the page at which to stop (set only on the page containing `end`)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Record the previous page as the last one before re-raising;
                # getpage() returns an empty page for page numbers above `_pagecount`
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2329
2330
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        # Number of items to drop from the beginning of the first page
        offset = start - first_page * self._pagesize
        if end is None:
            last_page, remaining = self._pagecount, None
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
            # Number of items that are still to be yielded
            remaining = end - start

        for pagenum in range(first_page, last_page):
            page = self.getpage(pagenum)
            if offset:
                page = page[offset:]
                offset = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested range
                    yield from page[:remaining]
                    break
                remaining -= len(page)
            yield from page
2355
2356
class PlaylistEntries:
    """Access to the entries of a playlist `info_dict` by 1-based index or slice.

    Indexing yields `(index, entry)` pairs. The entries may come from a list,
    a PagedList, a LazyList or any other iterable (which is wrapped in a LazyList)
    """
    # Placeholder for positions that have no entry when only some entries were requested
    MissingEntry = object()
    # Whether the end of the entries is known to have been reached
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Place each entry at its requested (1-based) position
            # and leave the other positions marked as missing
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # One comma-separated segment of a playlist items specification:
    # either a single index, or a range "[start]:[end][:step]" ("-" also separates start and end)
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or a slice for each comma-separated segment of `string`.

        Raises ValueError on empty or invalid segments and on a zero step"""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # The end is parsed as a float so that "inf"/"infinite" is supported
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield `(index, entry)` for the entries selected by the
        'playlist_items', 'playliststart' and 'playlistend' params"""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries if it can be determined here, else None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page, the page count is the entry count
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function that fetches the entry at a 0-based index and
        # raises `self.IndexError` when the index is beyond the end of the entries
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Fetch the entry through the `_handle_extraction_exceptions` wrapper of `ydl`
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError
        return get_entry

    def __getitem__(self, idx):
        # Generator of `(index, entry)` pairs; an int is treated as a slice containing only that index
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Make the stop inclusive
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forwards, nothing more can follow; going backwards,
                # lower indices may still exist
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):  # noqa: A001
        pass
2491
2492
def uppercase_escape(s):
    r"""Replace every `\UXXXXXXXX` escape sequence in `s` with the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        return decode_escape(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', unescape, s)
2499
2500
def lowercase_escape(s):
    r"""Replace every `\uXXXX` escape sequence in `s` with the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        return decode_escape(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', unescape, s)
2507
2508
def parse_qs(url, **kwargs):
    """Parse the query string of `url`; `kwargs` are passed on to `urllib.parse.parse_qs`."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2511
2512
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, one per line, and close it.

    Byte order marks, blank lines, lines starting with `#`, `;` or `]`,
    and trailing ` #...` comments are dropped.

    @returns  list of URLs
    """
    def fixup(line):
        if not isinstance(line, str):
            line = line.decode('utf-8', 'replace')
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if line.startswith(bom):
                line = line[len(bom):]
        line = line.lstrip()
        if not line or line.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        url, *_ = re.split(r'\s#', line, maxsplit=1)
        return url.rstrip()

    with contextlib.closing(batch_fd) as fd:
        urls = []
        for line in fd:
            url = fixup(line)
            if url:
                urls.append(url)
        return urls
2530
2531
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like `urllib.parse.urlencode` and return the result as ASCII bytes."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2534
2535
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  update query
       @returns             str
    """
    if isinstance(url, str):
        # Nothing to change: hand the string back untouched
        if not kwargs and not query_update:
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        # Existing parameters are kept unless overridden by `query_update`
        merged_query = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged_query, True)
    updated = url._replace(**kwargs)
    return urllib.parse.urlunparse(updated)
2554
2555
def update_url_query(url, query):
    """Shorthand for update_url() that passes query as its query_update argument"""
    updated = update_url(url, query_update=query)
    return updated
2558
2559
def _multipart_encode_impl(data, boundary):
    """
    Encode data as multipart/form-data using the given boundary

    @returns           (body bytes, Content-Type header value)
    @raises ValueError If the boundary occurs inside one of the encoded parts
    """
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes + b'\r\n'

    chunks = []
    for name, value in data.items():
        if isinstance(name, str):
            name = name.encode()
        if isinstance(value, str):
            value = value.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        part = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if boundary_bytes in part:
            raise ValueError('Boundary overlaps with data')
        chunks += (delimiter, part)

    chunks.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(chunks), f'multipart/form-data; boundary={boundary}'
2580
2581
def multipart_encode(data, boundary=None):
    """
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If a Unicode object is given, it is used as the boundary and a clash
        with the data raises ValueError. Otherwise random boundaries are
        generated until one does not clash.

    Reference: https://tools.ietf.org/html/rfc7578
    """
    if boundary is not None:
        # A caller-supplied boundary is never replaced; errors propagate
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary collided with the payload; draw another one
            continue
2610
2611
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """
    Check whether x is an instance of allowed_types but not of blocked_types

    When blocked_types is not given, str, bytes and mappings are blocked.
    """
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
2616
2617
def variadic(x, allowed_types=NO_DEFAULT):
    """
    Return x unchanged if it is iterable-like, otherwise wrap it in a 1-tuple

    allowed_types is passed to is_iterable_like() as its blocked_types, i.e.
    instances of these types are wrapped rather than returned as they are.
    """
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
2623
2624
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """
    Call each of funcs with args/kwargs in turn and return the first usable result

    A call raising one of the suppressed lookup/arithmetic errors is skipped, as
    is a result that is not an instance of expected_type (when given).
    @returns The first accepted result, or None if there is none
    """
    SUPPRESSED_ERRORS = (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError)
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except SUPPRESSED_ERRORS:
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
2634
2635
def try_get(src, getter, expected_type=None):
    """Apply one or several getter callables to src via try_call() and return the first accepted result"""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
2638
2639
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of dct for which cndn(key, value) is truthy"""
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
2642
2643
def merge_dicts(*dicts):
    """
    Merge dicts, giving precedence to earlier ones

    A key takes the first non-None value seen for it. An already stored empty
    string is replaced by a later string value.
    """
    merged = {}
    for source in dicts:
        for key, value in source.items():
            if key not in merged:
                if value is not None:
                    merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
2652
2653
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as str; non-str input is decoded with the given encoding and error handler"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
2656
2657
# Age limits for the US film rating labels recognized by parse_age_limit()
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limits for the "TV-*" rating labels recognized by parse_age_limit()
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """
    Convert an age rating into a numeric age limit

    Accepts an int in the range 0-21, a string of one or two digits with an
    optional trailing "+", a key of US_RATINGS, or a TV rating such as "TV-14"
    ("TV_14" and "TV14" are accepted too). Rating labels are case-insensitive.
    @returns The age limit as int, or None if s is not recognized
    """
    # bool is a subclass of int, so an exact type check is needed to reject True/False
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None

    age_match = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if age_match:
        return int(age_match.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    # Build the alternation from the table keys with their "TV-" prefix removed
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_match = re.match(r'^TV[_-]?({})$'.format(tv_suffixes), rating)
    if tv_match:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_match.group(1)]
    return None
2693
2694
def strip_jsonp(code):
    """Strip a JSONP callback wrapper from code and return the wrapped data; unmatched input is returned as is"""
    JSONP_WRAPPER_RE = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(JSONP_WRAPPER_RE, r'\g<callback_data>', code)
2703
2704
def js_to_json(code, vars={}, *, strict=False):
    """
    Rewrite JavaScript-like literal source into a JSON string using regex substitutions

    @param code    The JavaScript source text to convert
    @param vars    Mapping of identifier -> replacement text. In non-strict mode a
                   replacement that is not valid JSON is JSON-encoded first
    @param strict  If False, some constructs (new Date("..."), new X(...),
                   parseInt(...), an immediately-invoked function given a string)
                   are rewritten up front and unknown identifiers are emitted as
                   strings. If True, an unknown identifier raises ValueError
    @returns       The rewritten source as str
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # A string delimited by any of the quote characters, honoring backslash escapes
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    # A /* block */ comment, or a // comment up to and including the newline
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Optional whitespace surrounding at most one comment
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (regex, base) pairs for hex and octal literals, each optionally followed by ":"
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # group(1) is a bare double quote, group(2) the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Keep (or add) the backslash for the passthrough characters, turn "\x" into
        # "\u00", drop an escaped newline, and emit any other escaped character bare
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Convert the expression inside ${...} recursively; a result that is a
        # JSON string is unquoted so that its text is inlined
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Replacement callback for every token matched by the final re.sub below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith(('/*', '//', '!')) or v == ',':
            # Comments, runs of "!" and commas before a closing bracket are dropped
            return ''

        if v[0] in STRING_QUOTES:
            # Only backtick strings get ${...} substitution; the result is re-escaped
            # and emitted as a double-quoted string
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # A literal used as an object key is emitted as a quoted decimal key
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # The argument of `new Map(...)` is converted as a list of pairs and
        # re-serialized as a JSON object; no argument yields an empty object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    # `Array(...)` / `new Array(...)` becomes a `[...]` literal
    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # `new Date("...")` is reduced to its string argument
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        # Any other `new X(...)` expression is kept verbatim as a JSON string
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        # `parseInt(...)` around a single run of digits is reduced to those digits
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        # `(function(...){...})("...")` is reduced to its quoted argument
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2784
2785
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # The value is the position of qid in quality_ids; unknown ids give -1
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
2794
2795
# Stage names; presumably the points at which a postprocessor can be scheduled
# NOTE(review): semantics are defined by the consumers of this tuple - confirm there
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# %-style output template strings, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Further template types, each mapped to a string or None
# NOTE(review): the strings look like filename suffixes - confirm against the users of this dict
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Regex template: "{0}" is the slot for the mapping key pattern and "{1}" the
# slot for the conversion type pattern. A run of escaped "%%" directly before
# the format is captured as "prefix"
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2835
2836
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    # Leave room for the ellipses so the result is `length` characters long
    cutoff = length - len(ELLIPSES)
    return s[:cutoff] + ELLIPSES
2845
2846
def version_tuple(v):
    """Split the version string v on "." and "-" and return its components as a tuple of ints"""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2849
2850
def is_outdated_version(version, limit, assume_new=True):
    """
    Check whether version is older than limit

    A missing or unparsable version is treated as new when assume_new is
    truthy, and as outdated otherwise.
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            pass
    return not assume_new
2858
2859
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    from ..update import is_non_updateable

    if is_non_updateable():
        return False
    return True
2866
2867
def args_to_str(args):
    """Get a short string representation for a subprocess command"""
    quoted_command = shell_quote(args)
    return quoted_command
2871
2872
def error_to_str(err):
    """Format err as "<exception class name>: <message>" """
    return '{}: {}'.format(type(err).__name__, err)
2875
2876
def mimetype2ext(mt, default=NO_DEFAULT):
    """
    Map a MIME type to a file extension

    The full type, its subtype, and the part of the subtype after the last "+"
    are looked up in a fixed table, in that order.
    @param mt       MIME type string; parameters after ";" are ignored
    @param default  Returned when mt is not a str or has no table entry
    @returns        The extension. Without a default: None for non-str input,
                    otherwise the subtype with "+" replaced by "."
    """
    if not isinstance(mt, str):
        return None if default is NO_DEFAULT else default

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'video/ogg': 'ogv',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    mimetype = mt.partition(';')[0].strip().lower()
    subtype = mimetype.rpartition('/')[2]
    suffix = subtype.rsplit('+')[-1]

    ext = traversal.traverse_obj(MAP, mimetype, subtype, suffix)
    if ext:
        return ext
    if default is NO_DEFAULT:
        return subtype.replace('+', '.')
    return default
2966
2967
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension or for a URL/filename; falsy input gives None"""
    if not ext_or_url:
        return None
    # A value without any "." is taken to be a bare extension
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _ = mimetypes.guess_type(filename)
    return mime_type
2974
2975
def parse_codecs(codecs_str):
    """
    Split a comma-separated codecs string into video, audio and subtitle codecs

    @returns A dict with "vcodec", "acodec" and "dynamic_range" (plus "scodec" if
             a subtitle codec was found) when at least one codec is recognized.
             If none is recognized but exactly two codecs are given, they are
             taken as video and audio codec in that order. Otherwise {}
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    stripped = map(str.strip, codecs_str.strip().strip(',').split(','))
    split_codecs = [codec for codec in stripped if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros directly preceding another digit are removed before splitting
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            # Only the first video codec counts
            if vcodec:
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if vcodec or acodec or scodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if scodec is not None:
            result['scodec'] = scodec
        return result
    if len(split_codecs) == 2:
        first, second = split_codecs
        return {
            'vcodec': first,
            'acodec': second,
        }
    return {}
3016
3017
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """
    Choose a container extension for the given video/audio codecs and extensions

    @param vcodecs, vexts  Codecs and extensions of the video streams (same length)
    @param acodecs, aexts  Codecs and extensions of the audio streams (same length)
    @param preferences     Optional sequence of extensions to try, in order
    @returns               The chosen extension
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(vcodecs), len(acodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codec_list):
        # First codec only: the part before the first ".", zeros removed, lowercased
        return try_get(codec_list, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())

    vcodec = sanitize_codec(vcodecs)
    acodec = sanitize_codec(acodecs)

    # First pass: pick by codec compatibility
    for candidate in preferences or COMPATIBLE_CODECS.keys():
        supported = COMPATIBLE_CODECS.get(candidate, set())
        if candidate == 'mkv' or supported.issuperset((vcodec, acodec)):
            return candidate

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    # Second pass: pick by compatibility of the source extensions
    for candidate in preferences or vexts:
        if candidate == 'mkv':
            return candidate
        involved_exts = {candidate, *vexts, *aexts}
        if involved_exts == {candidate} or any(
                family.issuperset(involved_exts) for family in COMPATIBLE_EXTS):
            return candidate

    if allow_mkv:
        return 'mkv'
    return preferences[-1]
3057
3058
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """
    Determine a file extension from the headers of url_handle

    Tried in order: the filename of an attachment Content-Disposition, the
    "x-amz-meta-name" header, and finally the Content-Type (via mimetype2ext,
    which also receives default).
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            filename_ext = determine_ext(mobj.group('filename'), default_ext=None)
            if filename_ext:
                return filename_ext

    meta_name = headers.get('x-amz-meta-name')
    if meta_name:
        meta_name_ext = meta_name.rpartition('.')[2]
        if meta_name_ext:
            return meta_name_ext

    content_type = headers.get('Content-Type')
    return mimetype2ext(content_type, default=default)
3077
3078
def encode_data_uri(data, mime_type):
    """Build a base64 "data:" URI of the given MIME type from the bytes in data"""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3081
3082
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3091
3092
# List of known byte-order-marks (BOM)
# Each entry is a (marker bytes, encoding name) pair
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    detected_encoding = 'utf-8'
    for marker, codec in BOMS:
        marker_len = len(marker)
        # Strip every leading occurrence of the marker and decode with its encoding
        while first_bytes.startswith(marker):
            first_bytes = first_bytes[marker_len:]
            detected_encoding = codec

    text = first_bytes.decode(detected_encoding, 'replace')
    return re.match(r'^\s*<', text)
3112
3113
def determine_protocol(info_dict):
    """
    Determine the download protocol for info_dict

    An explicit "protocol" entry wins. Otherwise the protocol is derived from
    the start of the sanitized URL, its extension, or finally its scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3134
3135
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def visible_len(string):
        # Terminal sequences and the alignment tab do not take up any width
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_len(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, mask):
        return [cell for take, cell in itertools.zip_longest(mask, row, fillvalue=True) if take]

    # With hide_empty, columns whose widest data cell has zero width are dropped
    mask = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, mask)
    data = [keep_columns(row, mask) for row in data]

    widths = column_widths([header_row, *data])
    gap = extra_gap + 1
    if delim:
        separator = [delim * (col_width + gap) for col_width in widths]
        separator[-1] = separator[-1][:-gap * len(delim)]  # Remove extra_gap from end of delimiter
        rows = [header_row, separator, *data]
    else:
        rows = [header_row, *data]

    rendered = []
    for row in rows:
        cells = []
        for pos, text in enumerate(map(str, row)):
            padding = widths[pos] - visible_len(text)
            if '\t' in text:
                # The padding goes where the tab was, which right-aligns what follows it
                cells.append(text.replace('\t', ' ' * padding) + ' ' * gap)
            else:
                cells.append(text + ' ' * (padding + gap))
        rendered.append(''.join(cells).rstrip())
    return '\n'.join(rendered)
3165
3166
def _match_one(filter_part, dct, incomplete):
    """
    Evaluate a single filter condition against dct

    @param filter_part  Either a comparison "<key> [!]<op>[?] <value>" or a
                        presence test "[!]<key>"
    @param dct          The dict whose fields are tested
    @param incomplete   A bool, or a container of the keys that may be missing
                        from dct. Conditions on such keys pass when the key is missing
    @returns            Truthy if the condition passes
    @raises ValueError  If filter_part matches neither syntax, or a string operator
                        is used on a numeric field with a numeric comparison value
    """
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    if isinstance(incomplete, bool):
        is_incomplete = lambda _: incomplete
    else:
        is_incomplete = lambda k: k in incomplete

    operator_rex = re.compile(r'''(?x)
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>{})(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        '''.format('|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        # A full match always fills exactly one of the two value groups with a
        # non-empty string, so no further fallback is needed
        comparison_value = m['quotedstrval'] or m['strval']
        if m['quote']:
            # Unescape the quote character inside a quoted value
            comparison_value = comparison_value.replace(r'\{}'.format(m['quote']), m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, (int, float)):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                # Not a plain integer: try a file size, then a duration
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator {} only supports string values!'.format(m['op']))
        if actual_value is None:
            # A missing field passes if it is expected to be missing or the
            # operator carries the "?" suffix
            return is_incomplete(m['key']) or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)
        (?P<op>{})\s*(?P<key>[a-z_]+)
        '''.format('|'.join(map(re.escape, UNARY_OPERATORS.keys()))))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if is_incomplete(m.group('key')) and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError(f'Invalid filter part {filter_part!r}')
3245
3246
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    # Conditions are separated by "&"; a literal "&" is written as "\&"
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3257
3258
def match_filter_func(filters, breaking_filters=None):
    """
    Build a match function from filter strings

    @param filters           Filter string(s), of which at least one must pass.
                             The special filter "-" marks the result as interactive
    @param breaking_filters  Filter string(s); an entry rejected by them raises
                             RejectedVideoReached
    @returns                 None if neither kind of filter is given, else a function
                             (info_dict, incomplete=False) returning None or NO_DEFAULT
                             on a pass and a rejection message otherwise
    """
    if not filters and not breaking_filters:
        return None
    repr_ = f'{match_filter_func.__module__}.{match_filter_func.__qualname__}({filters}, {breaking_filters})'

    break_check = match_filter_func(breaking_filters) or (lambda _, __: None)
    filter_set = set(variadic(filters or []))

    interactive = '-' in filter_set
    filter_set.discard('-')

    @function_with_repr.set_repr(repr_)
    def _match_func(info_dict, incomplete=False):
        rejection = break_check(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if filter_set and not any(match_str(f, info_dict, incomplete) for f in filter_set):
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filter_set))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
        return NO_DEFAULT if interactive and not incomplete else None
    return _match_func
3284
3285
class download_range_func:
    """
    Callable that yields the sections of a video selected for download

    @param chapters   Regexes; every chapter whose title matches one is yielded
    @param ranges     (start, end) pairs; negative times are counted from the end
                      of the video when its duration is known
    @param from_info  Whether to also yield the start_time/end_time of the info dict
    """

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters = chapters
        self.ranges = ranges
        self.from_info = from_info

    def __call__(self, info_dict, ydl):

        if info_dict.get('chapters'):
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'
        for regex in self.chapters or []:
            for idx, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(regex, chapter['title']):
                    continue
                warning = None
                yield {**chapter, 'index': idx}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # Nothing was requested: yield a single empty section
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # A negative time is an offset from the end, clamped at 0
        if info.get('duration') and time < 0:
            return max(info['duration'] + time, 0)
        return time

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        cls_name = type(self).__name__
        return f'{__name__}.{cls_name}({self.chapters}, {self.ranges})'
3326
3327
def parse_dfxp_time_expr(time_expr):
    """
    Parse a DFXP time expression into seconds

    Supports a plain number with an optional "s" suffix, and clock times of the
    form H:MM:SS with an optional fraction separated by "." or ":".
    @returns The time in seconds, or None if time_expr is empty or not recognized
    """
    if not time_expr:
        return None

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
3339
3340
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode (HH:MM:SS,mmm)"""
    timecode = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timecode
3343
3344
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode (H:MM:SS.cc)"""
    timecode = timetuple_from_msec(seconds * 1000)
    # The last field is in centiseconds rather than milliseconds
    centiseconds = timecode.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timecode[:-1], centiseconds)
3348
3349
def dfxp2srt(dfxp_data):
    """
    Convert DFXP/TTML subtitle data to SRT

    Every paragraph (`p`) element with usable timing becomes one SRT cue.
    Supported TTML styling is rendered with `<font>`, `<b>`, `<i>` and `<u>` tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no paragraph elements
    """
    # (current namespace, [legacy namespaces]) pairs; the legacy URIs are rewritten
    # to the current one so a single set of xpath expressions matches every document
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # `tts:*` attributes that are understood; everything else is ignored
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration',
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # style id -> {property: value}, with inherited properties already merged in
    styles = {}
    # Properties taken from the styles referenced by the `body`/`div` elements
    default_style = {}

    class TTMLPElementParser:
        """XMLParser target that flattens one paragraph into SRT text"""
        _out = ''
        # NOTE(review): these two lists are class attributes, so they are shared by
        # every parser instance created in parse_node(). An applied style is pushed
        # whenever an element has any style, but only popped when that element
        # opened at least one tag, so entries can stay behind and affect which
        # attributes are emitted for later paragraphs. Making them per-instance
        # changes the generated output; confirm that is wanted before touching it.
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence (lowest to highest): default, referenced style, inline `tts:*`
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect with the same value
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += f' color="{v}"'
                        elif k == 'fontSize':
                            font += f' size="{v}"'
                        elif k == 'fontFamily':
                            font += f' face="{v}"'
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    # The style now in effect is the previous one overlaid with this element's
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close the tags opened by the matching start() in reverse order
                for element in reversed(unclosed_elements):
                    self._out += f'</{element}>'
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Re-serialize the paragraph and stream it through the target above;
        # parser.close() returns whatever the target's close() returns
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    # Fall back to un-namespaced `p` elements if no namespaced ones are found
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style can only be built once its parent has an
    # entry in `styles`, and parents may be declared after their children, so the
    # declarations are processed repeatedly until nothing is left waiting.
    style_elements = dfxp.findall(_x('.//ttml:style'))
    while True:
        known_before = len(styles)
        waiting = {}  # style id -> parent id that has no entry in `styles` yet
        for style in style_elements:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    waiting[style_id] = parent_style_id
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not waiting:
            break
        if len(styles) == known_before:
            # No style could be added in this pass, so the remaining parents can never
            # resolve on their own (undeclared id, parent without any supported
            # property, or a reference cycle). Treat them as empty styles instead of
            # looping forever. Prefer parents that are not themselves waiting so that
            # chains still inherit properly; for a pure cycle, break it everywhere.
            missing = set(waiting.values())
            for parent_id in (missing - set(waiting)) or missing:
                styles[parent_id] = {}

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # The index counts every paragraph, including the ones skipped below,
    # so cue numbers may have gaps
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        # Without a usable `end`, derive it from `dur`; skip the cue if neither is usable
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3516
3517
def cli_option(params, command_option, param, separator=None):
    """
    Build the command line arguments for an option that takes a value

    @param params          Mapping the value is looked up in
    @param command_option  The option name to emit, e.g. `--proxy`
    @param param           Key of the value in `params`
    @param separator       If given, option and value are joined with it into a
                           single argument; otherwise they are two arguments
    @returns               A list of arguments; empty if the value is None
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3523
3524
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """
    Build the command line arguments for an option that takes a boolean value

    The flag stored under `param` is translated to `true_value`/`false_value`
    and then formatted by cli_option(); an unset flag (None) yields no arguments.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    # Reuse cli_option() by looking the flag up in a flag -> text mapping
    rendered = {True: true_value, False: false_value}
    return cli_option(rendered, command_option, flag, separator)
3529
3530
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return `[command_option]` if `params[param]` equals `expected_value`, else `[]`"""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3533
3534
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """
    Pick the argument list for the first matching key group

    @param argdict     Dict mapping lowercase keys to argument lists. A plain
                       list/tuple is also accepted for backward compatibility
    @param keys        List/tuple of key groups, tried in order. Each group is
                       passed through variadic(); the argument lists of all keys
                       of the group that are present get concatenated
    @param default     Returned as-is when nothing matches
    @param use_compat  Whether a list/tuple `argdict` is returned unchanged;
                       if False it is treated like a missing `argdict`
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        present = [
            args for args in (argdict.get(key.lower()) for key in variadic(key_list))
            if args is not None]
        if not present:
            continue
        combined = []
        for args in present:
            combined.extend(args)
        return combined
    return default
3553
3554
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """
    Build the key lookup order for `main_key`/`exe` and fetch the matching arguments

    The root key is `exe` when both names are equal (case-insensitively) and
    `main_key+exe` otherwise; every entry of `keys` is appended to it as a suffix.
    The generic fallbacks are only added when the bare root key is among the
    resulting keys; otherwise `use_compat` is switched off.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    same_name = main_key == exe
    root_key = exe if same_name else f'{main_key}+{exe}'
    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if not same_name:
            # A tuple is a key group: both keys are looked up together
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3566
3567
class ISO639Utils:
    """Conversion between two-letter and three-letter language codes"""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """
        Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of `code` are looked up, so anything
        following them is ignored. Returns None for unknown codes.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """
        Convert language code from ISO 639-2/T to ISO 639-1

        When several two-letter codes map to `code`, the one listed first in
        the table is returned. Returns None for unknown codes.
        """
        matches = (
            short_name for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
3772
3773
class ISO3166Utils:
    """Lookup of country names by two-letter country code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """
        Convert a two-letter country code to the corresponding full name

        The lookup is case-insensitive. Returns None for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4035
4036
class GeoUtils:
    """Generation of random IPv4 addresses from per-country address blocks"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """
        Pick a random IPv4 address from an address block

        @param code_or_block  Either a two-character country code (looked up
                              case-insensitively in the table above) or an
                              address block in `address/prefix length` notation
        @returns              The address in dotted notation, or None if the
                              country code has no known block
        """
        block = code_or_block
        # Anything that is exactly two characters long is treated as a country code
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        addr, preflen = block.split('/')
        (addr_min,) = struct.unpack('!L', socket.inet_aton(addr))
        # Setting all bits to the right of the prefix gives the highest address
        host_mask = 0xffffffff >> int(preflen)
        addr_max = addr_min | host_mask
        chosen = random.randint(addr_min, addr_max)
        return str(socket.inet_ntoa(struct.pack('!L', chosen)))
4295
4296
4297 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4298 # released into Public Domain
4299 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4300
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    The result has no leading zero bytes, except that values which are not
    positive are returned as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    remaining = int(n)
    # Split into 32-bit words; these come out least significant first
    words = []
    while remaining > 0:
        words.append(struct.pack('>I', remaining & 0xffffffff))
        remaining >>= 32
    # The most significant word may start with zero bytes; drop them.
    # Nothing is left only if the loop above did not run at all
    packed = b''.join(reversed(words)).lstrip(b'\000') or b'\000'
    if blocksize > 0:
        overhang = len(packed) % blocksize
        if overhang:
            packed = (blocksize - overhang) * b'\000' + packed
    return packed
4329
4330
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # left-pad with zeros to a whole number of 32-bit words
    missing = -len(s) % 4
    if missing:
        s = b'\000' * missing + s
    value = 0
    for word in struct.unpack(f'>{len(s) // 4}I', s):
        value = (value << 32) + word
    return value
4346
4347
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """

    # the bytes are read in reverse order before being interpreted as a number
    reversed_hex = binascii.hexlify(data[::-1])
    ciphertext = pow(int(reversed_hex, 16), exponent, modulus)
    return format(ciphertext, 'x')
4363
4364
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result is laid out as [0, 2, <padding>, 0, *data]. The padding is
    made of random NON-ZERO octets: a zero octet inside the padding would be
    taken for the separator when the message is unpadded.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data would leave fewer than 8 padding octets
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2, *padding, 0, *data]
4378
4379
4380 def _base_n_table(n, table):
4381     if not table and not n:
4382         raise ValueError('Either table or n must be specified')
4383     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4384
4385     if n and n != len(table):
4386         raise ValueError(f'base {n} exceeds table length {len(table)}')
4387     return table
4388
4389
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string

    @param num      non-negative integer to convert
    @param n        base; defaults to the length of table
    @param table    characters to use as digits
    @raises ValueError  if num is negative, if the digit table has fewer than
                        two characters while num is non-zero, or if n/table
                        are invalid
    """
    # Floor division never brings a negative number to zero (it sticks at -1),
    # so refuse it up front instead of looping forever
    if num and num < 0:
        raise ValueError(f'cannot encode negative number {num}')

    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    if base < 2:
        # With a single digit the number would never shrink
        raise ValueError(f'cannot encode {num} with a table of length {base}')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4401
4402
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    value_of = {}
    for position, digit in enumerate(_base_n_table(n, table)):
        value_of[digit] = position
    # the base is the number of distinct digits in the table
    base = len(value_of)
    total = 0
    for digit in string:
        total = total * base + value_of[digit]
    return total
4410
4411
def decode_packed_codes(code):
    """Decode the packed code that PACKED_CODES_RE finds in code

    Every word of the obfuscated code is the base-n encoding of an index into
    the symbol list; it is replaced by that symbol, or kept as is when the
    symbol is empty.
    """
    packed, radix, total, words = re.search(PACKED_CODES_RE, code).groups()
    radix = int(radix)
    total = int(total)
    words = words.split('|')

    replacements = {}
    for index in reversed(range(total)):
        token = encode_base_n(index, radix)
        replacements[token] = words[index] or token

    def _lookup(match):
        return replacements[match.group(0)]

    return re.sub(r'\b(\w+)\b', _lookup, packed)
4428
4429
def caesar(s, alphabet, shift):
    """Shift every character of s that occurs in alphabet by shift positions

    The shift wraps around the alphabet; characters that are not part of the
    alphabet are kept unchanged.
    """
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = []
    for char in s:
        if char in alphabet:
            char = alphabet[(alphabet.index(char) + shift) % size]
        shifted.append(char)
    return ''.join(shifted)
4437
4438
def rot47(s):
    """Apply caesar() with a shift of 47 over the ASCII characters '!' through '~'"""
    alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, alphabet, shift=47)
4441
4442
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=value attribute list into a dict

    Values may be enclosed in double quotes, in which case they may contain
    commas; the quotes are not part of the returned value.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in pairs
    }
4450
4451
def urshift(val, n):
    """Shift val right by n bits, reading a negative val as an unsigned 32-bit integer"""
    if val < 0:
        val += 0x100000000
    return val >> n
4454
4455
def write_xattr(path, key, value):
    """Write the extended attribute key with the given value to the file at path

    value is a bytes object; it is only decoded when it has to be handed to an
    external executable. The following methods are tried in order:
      * when compat_os_name is 'nt': write to the stream "<path>:<key>"
      * os.setxattr, or the xattr/pyxattr module imported as `xattr`
      * the `setfattr` or `xattr` executables

    Raises XAttrMetadataError if writing fails and XAttrUnavailableError if no
    method is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        # ':' separates the file path from the stream name
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    setxattr = None
    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        # (with an older pyxattr, setxattr stays None and Method 2 is used)
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # the value is passed as a command-line argument, so it must be text
    value = value.decode()
    try:
        # `xattr` takes "-w key value path", `setfattr` takes "-n key -v value path"
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4507
4508
def random_birthday(year_field, month_field, day_field):
    """Return a random date between 1950-01-01 and 1995-12-31 (inclusive)

    The result is a dict that maps the three given field names to the year,
    month and day of that date, each as a string without zero padding.
    """
    earliest = dt.date(1950, 1, 1)
    latest = dt.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + dt.timedelta(random.randint(0, span_days))
    field_names = (year_field, month_field, day_field)
    components = (birthday.year, birthday.month, birthday.day)
    return {name: str(component) for name, component in zip(field_names, components)}
4519
4520
def find_available_port(interface=''):
    """Return a port number that could be bound on interface, or None on failure

    The port is picked by binding a temporary socket to port 0; that socket is
    closed again before returning.
    """
    with contextlib.suppress(OSError):
        with socket.socket() as sock:
            sock.bind((interface, 0))
            return sock.getsockname()[1]
    return None
4528
4529
# Templates for internet shortcut files, which are plain text files.
# Each template is a %-format string with named placeholders.

# INI-style shortcut; placeholder: %(url)s
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# Apple property list shortcut; placeholder: %(url)s
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# Desktop entry of type "Link"; placeholders: %(filename)s and %(url)s
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps a shortcut type name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4561
4562
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    IRIs without a hostname (e.g. relative references) are supported; their network location is left empty.
    An explicit port is kept, except for port 80 on `http` IRIs, where it is the default.

    Raises ValueError for IRIs with an IPv6 host.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no authority component
    if iri_parts.hostname:
        # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
        net_location += iri_parts.hostname.encode('idna').decode()

    # Port 80 is only redundant for plain HTTP; dropping it for any other scheme would change the target of the URI
    port = iri_parts.port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4605
4606
def to_high_limit_path(path):
    r"""Return path as an absolute path with the `\\?\` prefix on win32/cygwin, and unchanged elsewhere"""
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    absolute = os.path.abspath(path)
    return '\\\\?\\' + absolute
4613
4614
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Format the value found at field in obj using template

    The value is looked up with traversal.traverse_obj and passed through func
    before being formatted. default is returned instead when the value is
    falsy, or, if ignore is given, when the value is one of the ignore values.
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val
    else:
        skip = val in variadic(ignore)
    if skip:
        return default
    return template % func(val)
4620
4621
def clean_podcast_url(url):
    """Remove the known podcast tracking/redirect prefixes from url"""
    tracker_prefix = re.compile(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/''')
    without_trackers = tracker_prefix.sub('', url)
    # Removing a prefix can leave two schemes in a row; keep only the second
    return re.sub(r'^\w+://(\w+://)', r'\1', without_trackers)
4642
4643
_HEX_TABLE = '0123456789abcdef'
# Allowed values of the first digit of the fourth UUID group: its two most
# significant bits must be "10" to denote the RFC 4122 variant
_UUID_VARIANT_DIGITS = '89ab'


def random_uuidv4():
    """Return a random version 4 UUID as a lowercase, hyphenated string

    In the template, every `x` becomes a random hex digit and `y` becomes one
    of the digits permitted for the variant field; `4` is the version.
    """
    def _random_digit(mobj):
        return random.choice(_HEX_TABLE if mobj.group() == 'x' else _UUID_VARIANT_DIGITS)

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4649
4650
def make_dir(path, to_screen=None):
    """Create the directory that is to contain the file at path

    @param path         path of a file; only its directory part is created
    @param to_screen    optional callable that is given an error message if
                        the directory cannot be created
    @returns            True if the directory exists afterwards (or path has
                        no directory part), False if creating it failed
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # to_screen is optional, so only report when there is something to call
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4661
4662
def get_executable_path():
    """Return the absolute directory of the path reported by ..update._get_variant_and_executable_path()"""
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    absolute = os.path.abspath(executable)
    return os.path.dirname(absolute)
4667
4668
def get_user_config_dirs(package_name):
    """Yield the candidate per-user config directories for package_name, in order"""
    # .config (e.g. ~/.config/package_name)
    config_root = os.getenv('XDG_CONFIG_HOME')
    if not config_root:
        config_root = compat_expanduser('~/.config')
    yield os.path.join(config_root, package_name)

    # appdata (%APPDATA%/package_name)
    appdata_root = os.getenv('appdata')
    if appdata_root:
        yield os.path.join(appdata_root, package_name)

    # home (~/.package_name)
    home = compat_expanduser('~')
    yield os.path.join(home, f'.{package_name}')
4681
4682
def get_system_config_dirs(package_name):
    """Yield the candidate system-wide config directories for package_name"""
    # /etc/package_name
    etc_dir = os.path.join('/etc', package_name)
    yield etc_dir
4686
4687
def time_seconds(**kwargs):
    """
    Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the offset that the keyword arguments describe.
    The keyword arguments are those of dt.timedelta (e.g. hours=9)
    """
    now = time.time()
    offset = dt.timedelta(**kwargs)
    return now + offset.total_seconds()
4693
4694
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JWT that is signed with HMAC-SHA256

    @param payload_data     JSON-serializable claims
    @param key              secret used for signing, as str
    @param headers          optional dict of extra/overriding header fields
    @returns                the token as bytes: b'<header>.<payload>.<signature>'

    As required by the JWS Compact Serialization, every part is base64url
    encoded with the trailing '=' padding removed.
    """
    def _b64url(raw):
        return base64.urlsafe_b64encode(raw).rstrip(b'=')

    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = _b64url(json.dumps(header_data).encode())
    payload_b64 = _b64url(json.dumps(payload_data).encode())
    # the signature covers the encoded header and payload, joined by a dot
    signing_input = header_b64 + b'.' + payload_b64
    h = hmac.new(key.encode(), signing_input, hashlib.sha256)
    return signing_input + b'.' + _b64url(h.digest())
4711
4712
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of a three-part JWT; the signature is not checked"""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded_payload = f'{payload_b64}==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
4718
4719
# State of virtual terminal processing: False when compat_os_name is 'nt'
# (until windows_enable_vt_mode() sets it to True), None on any other OS
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4721
4722
@functools.cache
def supports_terminal_sequences(stream):
    """Whether terminal escape sequences may be written to stream

    This requires WINDOWS_VT_MODE to be truthy when compat_os_name is 'nt',
    and the TERM environment variable to be set otherwise. In addition,
    stream.isatty() must succeed and return a truthy value.
    The result is cached per stream.
    """
    if compat_os_name != 'nt':
        if not os.getenv('TERM'):
            return False
    elif not WINDOWS_VT_MODE:
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
4734
4735
def windows_enable_vt_mode():
    """Enable virtual terminal processing for the Windows console

    Does nothing if get_windows_version() is below (10, 0, 10586).
    On success, WINDOWS_VT_MODE is set to True and the cache of
    supports_terminal_sequences() is cleared.
    Raises Exception if the console mode cannot be read or changed.

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # these modules are imported here rather than at the top of the file
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        # convert the C runtime file descriptor into an OS handle
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # add the flag to the current mode instead of replacing the mode
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # results cached while WINDOWS_VT_MODE was still False are now outdated
    supports_terminal_sequences.cache_clear()
4766
4767
# ESC, "[", at least one character other than "m", and a final "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return string with everything that matches _terminal_sequences_re removed"""
    return re.sub(_terminal_sequences_re, '', string)
4773
4774
def number_of_digits(number):
    """Return the length of number formatted with '%d' (a minus sign is counted as well)"""
    formatted = '%d' % number
    return len(formatted)
4777
4778
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy values using delim

    If from_dict is given, the values are taken as paths that are looked up
    in from_dict with traversal.traverse_obj first.
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
4783
4784
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    The thumbnails are returned unchanged if no format has a width.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    # tuples compare by width first, then by height
    dimensions = [tuple(fmt.get(key) or 0 for key in _keys) for fmt in formats]
    max_dimensions = max(dimensions, default=(0, 0))
    max_width = max_dimensions[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        rescaled_url = {'url': re.sub(url_width_re, str(max_width), thumbnail['url'])}
        scaled.append(merge_dicts(rescaled_url, dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
4805
4806
def parse_http_range(range):
    """
    Parse value of "Range" or "Content-Range" HTTP header into tuple.

    The tuple is (start, end, total length); it consists of three None
    if the value is empty or cannot be parsed.
    """
    unknown = (None, None, None)
    if not range:
        return unknown
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if crg is None:
        return unknown
    start, end, total = crg.groups()
    return int(start), int_or_none(end), int_or_none(total)
4815
4816
def read_stdin(what):
    """Return sys.stdin; if what is truthy, first write a note on how to end the input"""
    if not what:
        return sys.stdin
    eof = 'Ctrl+D'
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
4822
4823
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data     the first bytes of the file
    @returns (encoding, bytes to skip)
             encoding is None if it could not be detected
    """

    # BOM marks are given priority over declarations
    from_bom = next(((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if from_bom is not None:
        return from_bom

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    without_nulls = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', without_nulls)
    encoding = mobj.group(1).decode() if mobj else None
    return encoding, 0
4840
4841
class Config:
    """A list of arguments along with the configs that these arguments refer to

    The arguments are parsed with parser, which has to provide
    parse_known_args(), parse_args() and error(). Every location in the
    `config_locations` of the parsed options is loaded as a nested Config.
    """
    own_args = None  # the arguments given to init()
    parsed_args = None  # own_args, once load_configs() has parsed them
    filename = None  # the file that own_args were read from, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        # _loaded_paths is shared with all nested configs (see append_config)
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Set the arguments (and the file they came from) and load the nested configs

        @returns    False if filename has been loaded before, True otherwise
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own_args and load every config location that they specify

        @returns    False if filename has been loaded before, True otherwise
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # stdin is read at most once
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # locations are joined onto the directory of the current file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            # nested configs are shown below, each line prefixed with "| "
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read the arguments contained in the file filename

        @param filename     path of the file to read
        @param default      value to return if the file cannot be opened
        @returns            list of the arguments in the file
        @raises ValueError  if the contents cannot be decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        # `with` makes sure that the file is closed, whatever is raised below
        with optionf:
            try:
                enc, skip = determine_file_encoding(optionf.read(512))
                optionf.seek(skip, io.SEEK_SET)
            except OSError:
                enc = None  # silently skip read errors
            try:
                # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                contents = optionf.read().decode(enc or preferredencoding())
                res = shlex.split(contents, comments=True)
            except Exception as err:
                raise ValueError(f'Unable to parse "{filename}": {err}') from err
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of opts in which the values of the private options are replaced with 'PRIVATE'"""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # handles the form "--option=value"
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # handles the form "--option value"
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a nested config from args (see init) and keep it, unless its file was loaded before"""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """The arguments of the nested configs (last one first), followed by the own arguments"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
4949
4950
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            # title-casing makes names that differ only in case collide
            merged[name.title()] = value
    return merged
4954
4955
def cached_method(f):
    """Cache a method

    The results are kept per instance in the `_cached_method__cache` attribute.
    They are keyed by the name of the method and by the values of all of its
    arguments after defaults have been applied.
    """
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        call = signature.bind(self, *args, **kwargs)
        call.apply_defaults()
        # the first argument is the instance itself
        _, *arg_values = call.arguments.values()
        key = tuple(arg_values)

        instance_cache = vars(self).setdefault('_cached_method__cache', {})
        results = instance_cache.setdefault(f.__name__, {})
        if key in results:
            return results[key]
        result = results[key] = f(self, *args, **kwargs)
        return result
    return wrapper
4971
4972
class classproperty:
    """property access for class methods with optional caching

    Usable both as `@classproperty` and as `@classproperty(cache=True)`.
    With caching, the value is computed only once per class.
    """
    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # used with arguments only: wait for the function to decorate
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls not in cache:
            cache[cls] = self.func(cls)
        return cache[cls]
4991
4992
class function_with_repr:
    """Wrap a function so that its repr() is repr_ or, without one, its qualified name"""

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    @classmethod
    def set_repr(cls, repr_):
        """Return a decorator that wraps a function with the given repr_"""
        return functools.partial(cls, repr_=repr_)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5009
5010
class Namespace(types.SimpleNamespace):
    """A SimpleNamespace that iterates over the values of its attributes"""

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        """The (name, value) pairs of the attributes"""
        return vars(self).items()
5020
5021
# File extensions grouped by the kind of media
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# `video` and `audio` also include the respective `common_*` extensions
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5036
5037
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue

    The loop body is run again for as long as it sets `retry.error` and the
    number of attempts does not exceed `_retries` + 1. After every failed
    attempt, `_error_callback` is called with the error, the number of
    attempts made so far and the number of retries, plus the keyword
    arguments that were given to the constructor.
    """
    # _error is NO_DEFAULT while the current attempt has not failed;
    # the initial None only serves to let the first attempt run
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # stop once an attempt ended without error or the retries are used up
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        """The error of the current attempt, or None if there is none"""
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # start every attempt without an error
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            # the loop body has run; report the error that it has set, if any
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        @param e            the error of the failed attempt
        @param count        number of attempts made so far
        @param retries      number of retries allowed
        @param sleep_func   delay before the next attempt; either a value or
                            a callable that is called with n=<count - 1>
        @param info         callable for the message about sleeping
        @param warn         callable for the message about retrying
        @param error        callable for the message about giving up;
                            without it, e is raised instead
        @param suffix       text to append to "Retrying" in the warning
        """
        if count > retries:
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # use the cause or the original message, without a trailing "."
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5092
5093
def make_archive_id(ie, video_id):
    """Return '<lowercased key> <video_id>'; ie is either the key itself (str) or an object with an ie_key() method"""
    if not isinstance(ie, str):
        ie = ie.ie_key()
    return f'{ie.lower()} {video_id}'
5097
5098
def truncate_string(s, left, right=0):
    """Shorten s to at most left + right characters by replacing its middle with '...'

    The result consists of the first left - 3 characters, '...', and the last
    right characters. s is returned as is when it is None or short enough.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5104
5105
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Build an ordered set of items from a list of options

    Every option names an alias from alias_dict or an item of
    alias_dict['all'] (a regex that is matched against these items if
    use_regex is set). A leading '-' removes the items instead of adding
    them. Raises ValueError for an option that is neither.

    @param start    the items to begin with
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for option in options:
        discard = option.startswith('-')
        name = option[1:] if discard else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if discard:
                # discarding an alias inverts each of its members
                expansion = [
                    i[1:] if i.startswith('-') else f'-{i}' for i in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            current = [name]
        else:
            raise ValueError(name)

        if not discard:
            requested.extend(current)
            continue
        for item in current:
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5134
5135
5136 # TODO: Rewrite
5137 class FormatSorter:
5138     regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5139
5140     default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5141                'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5142                'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
5143     ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5144                     'height', 'width', 'proto', 'vext', 'abr', 'aext',
5145                     'fps', 'fs_approx', 'source', 'id')
5146
5147     settings = {
5148         'vcodec': {'type': 'ordered', 'regex': True,
5149                    'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5150         'acodec': {'type': 'ordered', 'regex': True,
5151                    'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
5152         'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5153                 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5154         'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5155                   'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5156         'vext': {'type': 'ordered', 'field': 'video_ext',
5157                  'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5158                  'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
5159         'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5160                  'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5161                  'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
5162         'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5163         'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5164                        'field': ('vcodec', 'acodec'),
5165                        'function': lambda it: int(any(v != 'none' for v in it))},
5166         'ie_pref': {'priority': True, 'type': 'extractor'},
5167         'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5168         'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5169         'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5170         'quality': {'convert': 'float', 'default': -1},
5171         'filesize': {'convert': 'bytes'},
5172         'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5173         'id': {'convert': 'string', 'field': 'format_id'},
5174         'height': {'convert': 'float_none'},
5175         'width': {'convert': 'float_none'},
5176         'fps': {'convert': 'float_none'},
5177         'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5178         'tbr': {'convert': 'float_none'},
5179         'vbr': {'convert': 'float_none'},
5180         'abr': {'convert': 'float_none'},
5181         'asr': {'convert': 'float_none'},
5182         'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5183
5184         'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5185         'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
5186                'function': lambda it: next(filter(None, it), None)},
5187         'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
5188                  'function': lambda it: next(filter(None, it), None)},
5189         'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5190         'res': {'type': 'multiple', 'field': ('height', 'width'),
5191                 'function': lambda it: min(filter(None, it), default=0)},
5192
5193         # Actual field names
5194         'format_id': {'type': 'alias', 'field': 'id'},
5195         'preference': {'type': 'alias', 'field': 'ie_pref'},
5196         'language_preference': {'type': 'alias', 'field': 'lang'},
5197         'source_preference': {'type': 'alias', 'field': 'source'},
5198         'protocol': {'type': 'alias', 'field': 'proto'},
5199         'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5200         'audio_channels': {'type': 'alias', 'field': 'channels'},
5201
5202         # Deprecated
5203         'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5204         'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5205         'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5206         'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5207         'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5208         'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5209         'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5210         'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5211         'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5212         'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5213         'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5214         'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5215         'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5216         'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5217         'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5218         'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5219         'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5220         'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5221         'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5222         'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5223     }
5224
5225     def __init__(self, ydl, field_preference):
5226         self.ydl = ydl
5227         self._order = []
5228         self.evaluate_params(self.ydl.params, field_preference)
5229         if ydl.params.get('verbose'):
5230             self.print_verbose_info(self.ydl.write_debug)
5231
5232     def _get_field_setting(self, field, key):
5233         if field not in self.settings:
5234             if key in ('forced', 'priority'):
5235                 return False
5236             self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5237                                         'deprecated and may be removed in a future version')
5238             self.settings[field] = {}
5239         prop_obj = self.settings[field]
5240         if key not in prop_obj:
5241             type_ = prop_obj.get('type')
5242             if key == 'field':
5243                 default = 'preference' if type_ == 'extractor' else (field,) if type_ in ('combined', 'multiple') else field
5244             elif key == 'convert':
5245                 default = 'order' if type_ == 'ordered' else 'float_string' if field else 'ignore'
5246             else:
5247                 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key)
5248             prop_obj[key] = default
5249         return prop_obj[key]
5250
5251     def _resolve_field_value(self, field, value, convert_none=False):
5252         if value is None:
5253             if not convert_none:
5254                 return None
5255         else:
5256             value = value.lower()
5257         conversion = self._get_field_setting(field, 'convert')
5258         if conversion == 'ignore':
5259             return None
5260         if conversion == 'string':
5261             return value
5262         elif conversion == 'float_none':
5263             return float_or_none(value)
5264         elif conversion == 'bytes':
5265             return parse_bytes(value)
5266         elif conversion == 'order':
5267             order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5268             use_regex = self._get_field_setting(field, 'regex')
5269             list_length = len(order_list)
5270             empty_pos = order_list.index('') if '' in order_list else list_length + 1
5271             if use_regex and value is not None:
5272                 for i, regex in enumerate(order_list):
5273                     if regex and re.match(regex, value):
5274                         return list_length - i
5275                 return list_length - empty_pos  # not in list
5276             else:  # not regex or  value = None
5277                 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5278         else:
5279             if value.isnumeric():
5280                 return float(value)
5281             else:
5282                 self.settings[field]['convert'] = 'string'
5283                 return value
5284
5285     def evaluate_params(self, params, sort_extractor):
5286         self._use_free_order = params.get('prefer_free_formats', False)
5287         self._sort_user = params.get('format_sort', [])
5288         self._sort_extractor = sort_extractor
5289
5290         def add_item(field, reverse, closest, limit_text):
5291             field = field.lower()
5292             if field in self._order:
5293                 return
5294             self._order.append(field)
5295             limit = self._resolve_field_value(field, limit_text)
5296             data = {
5297                 'reverse': reverse,
5298                 'closest': False if limit is None else closest,
5299                 'limit_text': limit_text,
5300                 'limit': limit}
5301             if field in self.settings:
5302                 self.settings[field].update(data)
5303             else:
5304                 self.settings[field] = data
5305
5306         sort_list = (
5307             tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5308             + (tuple() if params.get('format_sort_force', False)
5309                 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5310             + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5311
5312         for item in sort_list:
5313             match = re.match(self.regex, item)
5314             if match is None:
5315                 raise ExtractorError(f'Invalid format sort string "{item}" given by extractor')
5316             field = match.group('field')
5317             if field is None:
5318                 continue
5319             if self._get_field_setting(field, 'type') == 'alias':
5320                 alias, field = field, self._get_field_setting(field, 'field')
5321                 if self._get_field_setting(alias, 'deprecated'):
5322                     self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5323                                                 f'be removed in a future version. Please use {field} instead')
5324             reverse = match.group('reverse') is not None
5325             closest = match.group('separator') == '~'
5326             limit_text = match.group('limit')
5327
5328             has_limit = limit_text is not None
5329             has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5330             has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5331
5332             fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5333             limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5334             limit_count = len(limits)
5335             for (i, f) in enumerate(fields):
5336                 add_item(f, reverse, closest,
5337                          limits[i] if i < limit_count
5338                          else limits[0] if has_limit and not has_multiple_limits
5339                          else None)
5340
5341     def print_verbose_info(self, write_debug):
5342         if self._sort_user:
5343             write_debug('Sort order given by user: {}'.format(', '.join(self._sort_user)))
5344         if self._sort_extractor:
5345             write_debug('Sort order given by extractor: {}'.format(', '.join(self._sort_extractor)))
5346         write_debug('Formats sorted by: {}'.format(', '.join(['{}{}{}'.format(
5347             '+' if self._get_field_setting(field, 'reverse') else '', field,
5348             '{}{}({})'.format('~' if self._get_field_setting(field, 'closest') else ':',
5349                               self._get_field_setting(field, 'limit_text'),
5350                               self._get_field_setting(field, 'limit'))
5351             if self._get_field_setting(field, 'limit_text') is not None else '')
5352             for field in self._order if self._get_field_setting(field, 'visible')])))
5353
5354     def _calculate_field_preference_from_value(self, format_, field, type_, value):
5355         reverse = self._get_field_setting(field, 'reverse')
5356         closest = self._get_field_setting(field, 'closest')
5357         limit = self._get_field_setting(field, 'limit')
5358
5359         if type_ == 'extractor':
5360             maximum = self._get_field_setting(field, 'max')
5361             if value is None or (maximum is not None and value >= maximum):
5362                 value = -1
5363         elif type_ == 'boolean':
5364             in_list = self._get_field_setting(field, 'in_list')
5365             not_in_list = self._get_field_setting(field, 'not_in_list')
5366             value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5367         elif type_ == 'ordered':
5368             value = self._resolve_field_value(field, value, True)
5369
5370         # try to convert to number
5371         val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5372         is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5373         if is_num:
5374             value = val_num
5375
5376         return ((-10, 0) if value is None
5377                 else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
5378                 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5379                 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5380                 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5381                 else (-1, value, 0))
5382
5383     def _calculate_field_preference(self, format_, field):
5384         type_ = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
5385         get_value = lambda f: format_.get(self._get_field_setting(f, 'field'))
5386         if type_ == 'multiple':
5387             type_ = 'field'  # Only 'field' is allowed in multiple for now
5388             actual_fields = self._get_field_setting(field, 'field')
5389
5390             value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5391         else:
5392             value = get_value(field)
5393         return self._calculate_field_preference_from_value(format_, field, type_, value)
5394
5395     def calculate_preference(self, format):
5396         # Determine missing protocol
5397         if not format.get('protocol'):
5398             format['protocol'] = determine_protocol(format)
5399
5400         # Determine missing ext
5401         if not format.get('ext') and 'url' in format:
5402             format['ext'] = determine_ext(format['url'])
5403         if format.get('vcodec') == 'none':
5404             format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5405             format['video_ext'] = 'none'
5406         else:
5407             format['video_ext'] = format['ext']
5408             format['audio_ext'] = 'none'
5409         # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
5410         #    format['preference'] = -1000
5411
5412         if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5413             # HEVC-over-FLV is out-of-spec by FLV's original spec
5414             # ref. https://trac.ffmpeg.org/ticket/6389
5415             # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5416             format['preference'] = -100
5417
5418         # Determine missing bitrates
5419         if format.get('vcodec') == 'none':
5420             format['vbr'] = 0
5421         if format.get('acodec') == 'none':
5422             format['abr'] = 0
5423         if not format.get('vbr') and format.get('vcodec') != 'none':
5424             format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
5425         if not format.get('abr') and format.get('acodec') != 'none':
5426             format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
5427         if not format.get('tbr'):
5428             format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
5429
5430         return tuple(self._calculate_field_preference(format, field) for field in self._order)
5431
5432
def filesize_from_tbr(tbr, duration):
    """
    Estimate the size of a stream from its bitrate and its length

    @param tbr:      Total bitrate in kbps (1000 bits/sec)
    @param duration: Duration in seconds
    @returns         Filesize in bytes, or None if either argument is None
    """
    inputs_known = tbr is not None and duration is not None
    if not inputs_known:
        return None
    bytes_per_kilobit = 1000 / 8
    return int(duration * tbr * bytes_per_kilobit)
5442
5443
5444 # XXX: Temporary
class _YDLLogger:
    """
    Logger that hands every message to the wrapped `ydl` object.
    Without a (truthy) `ydl`, all messages are silently dropped
    """

    def __init__(self, ydl=None):
        self._ydl = ydl

    def debug(self, message):
        """Pass `message` to `ydl.write_debug`"""
        if not self._ydl:
            return
        self._ydl.write_debug(message)

    def info(self, message):
        """Pass `message` to `ydl.to_screen`"""
        if not self._ydl:
            return
        self._ydl.to_screen(message)

    def warning(self, message, *, once=False):
        """Pass `message` and `once` (positionally) to `ydl.report_warning`"""
        if not self._ydl:
            return
        self._ydl.report_warning(message, once)

    def error(self, message, *, is_error=True):
        """Pass `message` and the `is_error` flag to `ydl.report_error`"""
        if not self._ydl:
            return
        self._ydl.report_error(message, is_error=is_error)

    def stdout(self, message):
        """Pass `message` to `ydl.to_stdout`"""
        if not self._ydl:
            return
        self._ydl.to_stdout(message)

    def stderr(self, message):
        """Pass `message` to `ydl.to_stderr`"""
        if not self._ydl:
            return
        self._ydl.to_stderr(message)