yt_dlp/utils/_utils.py

   1 import base64
   2 import binascii
   3 import calendar
   4 import codecs
   5 import collections
   6 import collections.abc
   7 import contextlib
   8 import datetime as dt
   9 import email.header
  10 import email.utils
  11 import errno
  12 import hashlib
  13 import hmac
  14 import html.entities
  15 import html.parser
  16 import inspect
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import mimetypes
  23 import netrc
  24 import operator
  25 import os
  26 import platform
  27 import random
  28 import re
  29 import shlex
  30 import socket
  31 import ssl
  32 import struct
  33 import subprocess
  34 import sys
  35 import tempfile
  36 import time
  37 import traceback
  38 import types
  39 import unicodedata
  40 import urllib.error
  41 import urllib.parse
  42 import urllib.request
  43 import xml.etree.ElementTree
  44
  45 from . import traversal
  46
  47 from ..compat import functools  # isort: split
  48 from ..compat import (
  49     compat_etree_fromstring,
  50     compat_expanduser,
  51     compat_HTMLParseError,
  52     compat_os_name,
  53 )
  54 from ..dependencies import xattr
  55
# Drop the last dotted component of this module's name
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
# (the type of compiled pattern objects, obtained by compiling an empty pattern)
compiled_regex_type = type(re.compile(''))
  60
  61
  62 class NO_DEFAULT:
  63     pass
  64
  65
  66 def IDENTITY(x):
  67     return x
  68
  69
# Full English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code; every list holds 12 names, January first
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
  85
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Maps timezone abbreviations to a number; presumably the offset from UTC in
# whole hours (TODO confirm against the code that consumes this table)
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
  95
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. Single-character
# replacements are taken from the plain strings; multi-character ones
# ('AE', 'OE', 'TH', 'ss', ...) are wrapped in lists so that itertools.chain
# yields them as one item each
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 100
# Date/time layouts written with %-directives (presumably consumed by
# strptime-style parsing elsewhere -- confirm against the callers).
# None of these is ambiguous about the order of day and month
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# The common layouts plus numeric layouts read as day-then-month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# The common layouts plus numeric layouts read as month-then-day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 166
# Captures four groups from text shaped like `}('...',<digits>,<digits>,'...'.split('|')`
# NOTE(review): looks like the tail of "packed" JavaScript code -- confirm with the users of this pattern
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type is application/ld+json (case-insensitive,
# dot matches newlines) and captures its JSON object/array body as `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Unsigned integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
 171
 172
 173 @functools.cache
 174 def preferredencoding():
 175     """Get preferred encoding.
 176
 177     Returns the best encoding scheme for the system, based on
 178     locale.getpreferredencoding() and some further tweaks.
 179     """
 180     try:
 181         pref = locale.getpreferredencoding()
 182         'TEST'.encode(pref)
 183     except Exception:
 184         pref = 'UTF-8'
 185
 186     return pref
 187
 188
 189 def write_json_file(obj, fn):
 190     """ Encode obj as JSON and write it to fn, atomically if possible """
 191
 192     tf = tempfile.NamedTemporaryFile(
 193         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 194         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 195
 196     try:
 197         with tf:
 198             json.dump(obj, tf, ensure_ascii=False)
 199         if sys.platform == 'win32':
 200             # Need to remove existing file on Windows, else os.rename raises
 201             # WindowsError or FileExistsError.
 202             with contextlib.suppress(OSError):
 203                 os.unlink(fn)
 204         with contextlib.suppress(OSError):
 205             mask = os.umask(0)
 206             os.umask(mask)
 207             os.chmod(tf.name, 0o666 & ~mask)
 208         os.rename(tf.name, fn)
 209     except Exception:
 210         with contextlib.suppress(OSError):
 211             os.remove(tf.name)
 212         raise
 213
 214
 215 def find_xpath_attr(node, xpath, key, val=None):
 216     """ Find the xpath xpath[@key=val] """
 217     assert re.match(r'^[a-zA-Z_-]+$', key)
 218     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 219     return node.find(expr)
 220
# NOTE: On Python 2.6, the xml.etree.ElementTree.Element methods did not
# support the namespace parameter
 223
 224
 225 def xpath_with_ns(path, ns_map):
 226     components = [c.split(':') for c in path.split('/')]
 227     replaced = []
 228     for c in components:
 229         if len(c) == 1:
 230             replaced.append(c[0])
 231         else:
 232             ns, tag = c
 233             replaced.append('{%s}%s' % (ns_map[ns], tag))
 234     return '/'.join(replaced)
 235
 236
 237 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 238     def _find_xpath(xpath):
 239         return node.find(xpath)
 240
 241     if isinstance(xpath, str):
 242         n = _find_xpath(xpath)
 243     else:
 244         for xp in xpath:
 245             n = _find_xpath(xp)
 246             if n is not None:
 247                 break
 248
 249     if n is None:
 250         if default is not NO_DEFAULT:
 251             return default
 252         elif fatal:
 253             name = xpath if name is None else name
 254             raise ExtractorError('Could not find XML element %s' % name)
 255         else:
 256             return None
 257     return n
 258
 259
 260 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 261     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 262     if n is None or n == default:
 263         return n
 264     if n.text is None:
 265         if default is not NO_DEFAULT:
 266             return default
 267         elif fatal:
 268             name = xpath if name is None else name
 269             raise ExtractorError('Could not find XML element\'s text %s' % name)
 270         else:
 271             return None
 272     return n.text
 273
 274
 275 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 276     n = find_xpath_attr(node, xpath, key)
 277     if n is None:
 278         if default is not NO_DEFAULT:
 279             return default
 280         elif fatal:
 281             name = f'{xpath}[@{key}]' if name is None else name
 282             raise ExtractorError('Could not find XML attribute %s' % name)
 283         else:
 284             return None
 285     return n.attrib[key]
 286
 287
 288 def get_element_by_id(id, html, **kwargs):
 289     """Return the content of the tag with the specified ID in the passed HTML document"""
 290     return get_element_by_attribute('id', id, html, **kwargs)
 291
 292
 293 def get_element_html_by_id(id, html, **kwargs):
 294     """Return the html of the tag with the specified ID in the passed HTML document"""
 295     return get_element_html_by_attribute('id', id, html, **kwargs)
 296
 297
 298 def get_element_by_class(class_name, html):
 299     """Return the content of the first tag with the specified class in the passed HTML document"""
 300     retval = get_elements_by_class(class_name, html)
 301     return retval[0] if retval else None
 302
 303
 304 def get_element_html_by_class(class_name, html):
 305     """Return the html of the first tag with the specified class in the passed HTML document"""
 306     retval = get_elements_html_by_class(class_name, html)
 307     return retval[0] if retval else None
 308
 309
 310 def get_element_by_attribute(attribute, value, html, **kwargs):
 311     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 312     return retval[0] if retval else None
 313
 314
 315 def get_element_html_by_attribute(attribute, value, html, **kargs):
 316     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 317     return retval[0] if retval else None
 318
 319
 320 def get_elements_by_class(class_name, html, **kargs):
 321     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 322     return get_elements_by_attribute(
 323         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 324         html, escape_value=False)
 325
 326
 327 def get_elements_html_by_class(class_name, html):
 328     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 329     return get_elements_html_by_attribute(
 330         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 331         html, escape_value=False)
 332
 333
 334 def get_elements_by_attribute(*args, **kwargs):
 335     """Return the content of the tag with the specified attribute in the passed HTML document"""
 336     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 337
 338
 339 def get_elements_html_by_attribute(*args, **kwargs):
 340     """Return the html of the tag with the specified attribute in the passed HTML document"""
 341     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 342
 343
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator that yields one (content, whole) tuple per matching tag.

    @param attribute     Name of the attribute to look for
    @param value         Required attribute value; nothing is yielded if it is empty
    @param html          The HTML document to search
    @param tag           Regex that the tag name has to match
    @param escape_value  Whether value is literal text (True) or already a regex (False)
    """
    if not value:
        return

    # The quotes around the attribute value are mandatory if value begins with
    # whitespace or one of "'`=<>, and optional otherwise
    # NOTE(review): re.match only inspects the start of value -- confirm that
    # checking just the first character is what is intended here
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches the opening tag up to and including the wanted attribute;
    # other (optionally quoted) attributes may precede it
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Extract the complete element beginning at the matched opening tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Drop one pair of quotes enclosing the whole content, then decode HTML entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 369
 370
 371 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 372     """
 373     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 374     closing tag for the first opening tag it has encountered, and can be used
 375     as a context manager
 376     """
 377
 378     class HTMLBreakOnClosingTagException(Exception):
 379         pass
 380
 381     def __init__(self):
 382         self.tagstack = collections.deque()
 383         html.parser.HTMLParser.__init__(self)
 384
 385     def __enter__(self):
 386         return self
 387
 388     def __exit__(self, *_):
 389         self.close()
 390
 391     def close(self):
 392         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 393         # so data remains buffered; we no longer have any interest in it, thus
 394         # override this method to discard it
 395         pass
 396
 397     def handle_starttag(self, tag, _):
 398         self.tagstack.append(tag)
 399
 400     def handle_endtag(self, tag):
 401         if not self.tagstack:
 402             raise compat_HTMLParseError('no tags in the stack')
 403         while self.tagstack:
 404             inner_tag = self.tagstack.pop()
 405             if inner_tag == tag:
 406                 break
 407         else:
 408             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 409         if not self.tagstack:
 410             raise self.HTMLBreakOnClosingTagException()
 411
 412
 413 # XXX: This should be far less strict
 414 def get_element_text_and_html_by_tag(tag, html):
 415     """
 416     For the first element with the specified tag in the passed HTML document
 417     return its' content (text) and the whole element (html)
 418     """
 419     def find_or_raise(haystack, needle, exc):
 420         try:
 421             return haystack.index(needle)
 422         except ValueError:
 423             raise exc
 424     closing_tag = f'</{tag}>'
 425     whole_start = find_or_raise(
 426         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 427     content_start = find_or_raise(
 428         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 429     content_start += whole_start + 1
 430     with HTMLBreakOnClosingTagParser() as parser:
 431         parser.feed(html[whole_start:content_start])
 432         if not parser.tagstack or parser.tagstack[0] != tag:
 433             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 434         offset = content_start
 435         while offset < len(html):
 436             next_closing_tag_start = find_or_raise(
 437                 html[offset:], closing_tag,
 438                 compat_HTMLParseError(f'closing {tag} tag not found'))
 439             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 440             try:
 441                 parser.feed(html[offset:offset + next_closing_tag_end])
 442                 offset += next_closing_tag_end
 443             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 444                 return html[content_start:offset + next_closing_tag_start], \
 445                     html[whole_start:offset + next_closing_tag_end]
 446         raise compat_HTMLParseError('unexpected end of html')
 447
 448
 449 class HTMLAttributeParser(html.parser.HTMLParser):
 450     """Trivial HTML parser to gather the attributes for a single element"""
 451
 452     def __init__(self):
 453         self.attrs = {}
 454         html.parser.HTMLParser.__init__(self)
 455
 456     def handle_starttag(self, tag, attrs):
 457         self.attrs = dict(attrs)
 458         raise compat_HTMLParseError('done')
 459
 460
 461 class HTMLListAttrsParser(html.parser.HTMLParser):
 462     """HTML parser to gather the attributes for the elements of a list"""
 463
 464     def __init__(self):
 465         html.parser.HTMLParser.__init__(self)
 466         self.items = []
 467         self._level = 0
 468
 469     def handle_starttag(self, tag, attrs):
 470         if tag == 'li' and self._level == 0:
 471             self.items.append(dict(attrs))
 472         self._level += 1
 473
 474     def handle_endtag(self, tag):
 475         self._level -= 1
 476
 477
 478 def extract_attributes(html_element):
 479     """Given a string for an HTML element such as
 480     <el
 481          a="foo" B="bar" c="&98;az" d=boz
 482          empty= noval entity="&amp;"
 483          sq='"' dq="'"
 484     >
 485     Decode and return a dictionary of attributes.
 486     {
 487         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 488         'empty': '', 'noval': None, 'entity': '&',
 489         'sq': '"', 'dq': '\''
 490     }.
 491     """
 492     parser = HTMLAttributeParser()
 493     with contextlib.suppress(compat_HTMLParseError):
 494         parser.feed(html_element)
 495         parser.close()
 496     return parser.attrs
 497
 498
 499 def parse_list(webpage):
 500     """Given a string for an series of HTML <li> elements,
 501     return a dictionary of their attributes"""
 502     parser = HTMLListAttrsParser()
 503     parser.feed(webpage)
 504     parser.close()
 505     return parser.items
 506
 507
 508 def clean_html(html):
 509     """Clean an HTML snippet into a readable string"""
 510
 511     if html is None:  # Convenience for sanitizing descriptions etc.
 512         return html
 513
 514     html = re.sub(r'\s+', ' ', html)
 515     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 516     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 517     # Strip html tags
 518     html = re.sub('<.*?>', '', html)
 519     # Replace html entities
 520     html = unescapeHTML(html)
 521     return html.strip()
 522
 523
 524 class LenientJSONDecoder(json.JSONDecoder):
 525     # TODO: Write tests
 526     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 527         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 528         self._close_attempts = 2 * close_objects
 529         super().__init__(*args, **kwargs)
 530
 531     @staticmethod
 532     def _close_object(err):
 533         doc = err.doc[:err.pos]
 534         # We need to add comma first to get the correct error message
 535         if err.msg.startswith('Expecting \',\''):
 536             return doc + ','
 537         elif not doc.endswith(','):
 538             return
 539
 540         if err.msg.startswith('Expecting property name'):
 541             return doc[:-1] + '}'
 542         elif err.msg.startswith('Expecting value'):
 543             return doc[:-1] + ']'
 544
 545     def decode(self, s):
 546         if self.transform_source:
 547             s = self.transform_source(s)
 548         for attempt in range(self._close_attempts + 1):
 549             try:
 550                 if self.ignore_extra:
 551                     return self.raw_decode(s.lstrip())[0]
 552                 return super().decode(s)
 553             except json.JSONDecodeError as e:
 554                 if e.pos is None:
 555                     raise
 556                 elif attempt < self._close_attempts:
 557                     s = self._close_object(e)
 558                     if s is not None:
 559                         continue
 560                 raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
 561         assert False, 'Too many attempts to decode JSON'
 562
 563
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        # Use the underlying binary buffer of stdout if it has one
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # At most two attempts: first with the name as given, then with the sanitized name
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    # NOTE(review): presumably an OSError subclass, so that it is
                    # handled by the fallback right below -- confirm
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Could not open with a lock; fall back to a regular open()
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt, or when access was denied
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Retrying is pointless if sanitizing did not change the name
            if old_filename == filename:
                raise
 600                 raise
 601
 602
 603 def timeconvert(timestr):
 604     """Convert RFC 2822 defined time string into system timestamp"""
 605     timestamp = None
 606     timetuple = email.utils.parsedate_tz(timestr)
 607     if timetuple is not None:
 608         timestamp = email.utils.mktime_tz(timetuple)
 609     return timestamp
 610
 611
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; never empty unless s is empty
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Return the replacement for a single character. A substitute character
        # is prefixed with '\0' so that substitutes can be told apart from
        # original characters in the post-processing below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Question marks and control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            # Characters of the unicode categories C* and M* are dropped, the rest substituted
            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers, keeping the substitute characters themselves
    result = result.replace('\0', '') or '_'

    # Further clean-up, skipped for IDs (this branch also runs when is_id is unset)
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        # Replace a leading dash by an underscore
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 665
 666
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    @param s      The path to sanitize
    @param force  Sanitize on other platforms too; without it the path is
                  returned unchanged there
    """
    # XXX: this handles drive relative paths (c:sth) incorrectly
    if sys.platform == 'win32':
        # force has no effect on Windows; the drive/UNC prefix is split off and kept as is
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Discard the first component of the part that follows the drive/UNC prefix
        norm_path.pop(0)
    # In every component other than '.' and '..', replace the characters
    # /<>:"|\?* as well as a trailing whitespace or dot by '#'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    # TODO: Fix behavioral differences <3.12
    # The workaround using `normpath` only superficially passes tests
    # Ref: https://github.com/python/cpython/pull/100351
    return os.path.normpath(os.path.join(*sanitized_path))
 692
 693
 694 def sanitize_url(url, *, scheme='http'):
 695     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 696     # the number of unwanted failures due to missing protocol
 697     if url is None:
 698         return
 699     elif url.startswith('//'):
 700         return f'{scheme}:{url}'
 701     # Fix some common typos seen so far
 702     COMMON_TYPOS = (
 703         # https://github.com/ytdl-org/youtube-dl/issues/15649
 704         (r'^httpss://', r'https://'),
 705         # https://bx1.be/lives/direct-tv/
 706         (r'^rmtp([es]?)://', r'rtmp\1://'),
 707     )
 708     for mistake, fixup in COMMON_TYPOS:
 709         if re.match(mistake, url):
 710             return re.sub(mistake, fixup, url)
 711     return url
 712
 713
 714 def extract_basic_auth(url):
 715     parts = urllib.parse.urlsplit(url)
 716     if parts.username is None:
 717         return url, None
 718     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 719         parts.hostname if parts.port is None
 720         else '%s:%d' % (parts.hostname, parts.port))))
 721     auth_payload = base64.b64encode(
 722         ('%s:%s' % (parts.username, parts.password or '')).encode())
 723     return url, f'Basic {auth_payload.decode()}'
 724
 725
 726 def expand_path(s):
 727     """Expand shell variables and ~"""
 728     return os.path.expandvars(compat_expanduser(s))
 729
 730
 731 def orderedSet(iterable, *, lazy=False):
 732     """Remove all duplicates from the input iterable"""
 733     def _iter():
 734         seen = []  # Do not use set since the items can be unhashable
 735         for x in iterable:
 736             if x not in seen:
 737                 seen.append(x)
 738                 yield x
 739
 740     return _iter() if lazy else list(_iter())
 741
 742
 743 def _htmlentity_transform(entity_with_semicolon):
 744     """Transforms an HTML entity to a character."""
 745     entity = entity_with_semicolon[:-1]
 746
 747     # Known non-numeric HTML entity
 748     if entity in html.entities.name2codepoint:
 749         return chr(html.entities.name2codepoint[entity])
 750
 751     # TODO: HTML5 allows entities without a semicolon.
 752     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 753     if entity_with_semicolon in html.entities.html5:
 754         return html.entities.html5[entity_with_semicolon]
 755
 756     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 757     if mobj is not None:
 758         numstr = mobj.group(1)
 759         if numstr.startswith('x'):
 760             base = 16
 761             numstr = '0%s' % numstr
 762         else:
 763             base = 10
 764         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 765         with contextlib.suppress(ValueError):
 766             return chr(int(numstr, base))
 767
 768     # Unknown entity in name, return its literal representation
 769     return '&%s;' % entity
 770
 771
 772 def unescapeHTML(s):
 773     if s is None:
 774         return None
 775     assert isinstance(s, str)
 776
 777     return re.sub(
 778         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 779
 780
 781 def escapeHTML(text):
 782     return (
 783         text
 784         .replace('&', '&amp;')
 785         .replace('<', '&lt;')
 786         .replace('>', '&gt;')
 787         .replace('"', '&quot;')
 788         .replace("'", '&#39;')
 789     )
 790
 791
 792 class netrc_from_content(netrc.netrc):
 793     def __init__(self, content):
 794         self.hosts, self.macros = {}, {}
 795         with io.StringIO(content) as stream:
 796             self._parse('-', stream, False)
 797
 798
class Popen(subprocess.Popen):
    """subprocess.Popen with adjusted defaults and some convenience methods

    Differences visible in this class:
    - the environment is fixed up when running from a PyInstaller bundle
    - `text=True` selects UTF-8 with errors='replace' unless specified otherwise
    - on Windows, `shell=True` builds the `cmd.exe` command line itself
    - `communicate_or_kill()` and `run()` are added, `kill()` accepts a timeout
    """
    # Startup info handed to every process; only set up on Windows
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573

        env is modified in place. Does nothing unless `sys._MEIPASS` exists.
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            # Restore the value saved as `<key>_ORIG`; if there is none, remove the key
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        # Without an explicit environment, work on a copy of the current one.
        # Note that an environment passed by the caller is modified in place
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Truthy if any of the options that request text output is set; used by run()
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        # On Windows, build the shell invocation here instead of leaving it
        # to subprocess (unless the caller specified the executable to use)
        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            if not isinstance(args, str):
                args = shell_quote(args, shell=True)
            shell = False
            # Set variable for `cmd.exe` newline escaping (see `utils.shell_quote`)
            env['='] = '"^\n\n"'
            args = f'{self.__comspec()} /Q /S /D /V:OFF /E:ON /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        # Path of the command interpreter: %ComSpec% if set, else
        # %SystemRoot%\System32\cmd.exe. Only an absolute path is accepted
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process (and wait for it to exit)
        if anything is raised in the meantime; the exception is re-raised"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process and, unless timeout is 0, wait for it to exit
        (timeout is passed to wait(); None means no time limit)"""
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run a process to completion

        The arguments are those of the constructor; timeout is passed to communicate().
        @returns  Tuple (stdout, stderr, returncode), where an output that is
                  None or empty is returned as '' in text mode and b'' otherwise
        """
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
 871
 872
 873 def encodeArgument(s):
 874     # Legacy code that uses byte strings
 875     # Uncomment the following line after fixing all post processors
 876     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 877     return s if isinstance(s, str) else s.decode('ascii')
 878
 879
 880 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 881
 882
 883 def timetuple_from_msec(msec):
 884     secs, msec = divmod(msec, 1000)
 885     mins, secs = divmod(secs, 60)
 886     hrs, mins = divmod(mins, 60)
 887     return _timetuple(hrs, mins, secs, msec)
 888
 889
 890 def formatSeconds(secs, delim=':', msec=False):
 891     time = timetuple_from_msec(secs * 1000)
 892     if time.hours:
 893         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 894     elif time.minutes:
 895         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 896     else:
 897         ret = '%d' % time.seconds
 898     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 899
 900
 901 def bug_reports_message(before=';'):
 902     from ..update import REPOSITORY
 903
 904     msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
 905            'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')
 906
 907     before = before.rstrip()
 908     if not before or before.endswith(('.', '!', '?')):
 909         msg = msg[0].title() + msg[1:]
 910
 911     return (before + ' ' if before else '') + msg
 912
 913
 914 class YoutubeDLError(Exception):
 915     """Base exception for YoutubeDL errors."""
 916     msg = None
 917
 918     def __init__(self, msg=None):
 919         if msg is not None:
 920             self.msg = msg
 921         elif self.msg is None:
 922             self.msg = type(self).__name__
 923         super().__init__(self.msg)
 924
 925
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        cause, video_id and ie, if given, are included in the final message (see `__msg`).
        """
        from ..networking.exceptions import network_exceptions
        # An error raised while a network exception is being handled is always "expected"
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        # NB: While these are assigned, `msg` is still the class-level None,
        # so `__setattr__` does not try to rebuild the message yet
        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When wrapping another ExtractorError, keep the exc_info stored on that one
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: "[ie] video_id: orig_msg (caused by ...)", followed by
        # the bug report notice unless the error is expected
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted `traceback` and/or the formatted traceback of `cause`, or None if there is neither"""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            # The first entry returned by format_exception is dropped
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once a (truthy) message exists, changing any other attribute
        # regenerates `msg` and `args` so that they reflect the new value
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
 968
 969
 970 class UnsupportedError(ExtractorError):
 971     def __init__(self, url):
 972         super().__init__(
 973             'Unsupported URL: %s' % url, expected=True)
 974         self.url = url
 975
 976
 977 class RegexNotFoundError(ExtractorError):
 978     """Error when a regex didn't match"""
 979     pass
 980
 981
 982 class GeoRestrictedError(ExtractorError):
 983     """Geographic restriction Error exception.
 984
 985     This exception may be thrown when a video is not available from your
 986     geographic location due to geographic restrictions imposed by a website.
 987     """
 988
 989     def __init__(self, msg, countries=None, **kwargs):
 990         kwargs['expected'] = True
 991         super().__init__(msg, **kwargs)
 992         self.countries = countries
 993
 994
 995 class UserNotLive(ExtractorError):
 996     """Error when a channel/user is not live"""
 997
 998     def __init__(self, msg=None, **kwargs):
 999         kwargs['expected'] = True
1000         super().__init__(msg or 'The channel is not currently live', **kwargs)
1001
1002
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()).
        It is stored on the instance as `exc_info` without further processing.
        """
        super().__init__(msg)
        self.exc_info = exc_info
1015
1016
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Message used when the exception is created without one
    msg = 'Entry not found in info'
1024
1025
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """filename, if given, is appended to the default message"""
        message = self.msg if filename is None else f'{self.msg}: {filename}'
        super().__init__(message)
1038
1039
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    It adds nothing to YoutubeDLError other than its type.
    """
1046
1047
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Message used when the exception is created without one
    msg = 'The download was cancelled'
1051
1052
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Message used when the exception is created without one
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1056
1057
class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    # Message used when the exception is created without one
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1061
1062
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Message used when the exception is created without one
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1066
1067
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """ expected is stored on the instance as given; this class itself does not interpret it """
        super().__init__(msg)
        self.expected = expected
1074
1075
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Takes no arguments: always the fixed message above, with expected=False
        super().__init__(self.msg, expected=False)
1082
1083
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """err, if given, is appended to the default message"""
        message = self.msg if err is None else f'{self.msg}: {err}'
        super().__init__(message)
1096
1097
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """ downloaded and expected are the actual and the announced size; both are kept on the instance """
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        # NB: Here `expected` is a size, not a boolean flag
        self.expected = expected
1111
1112
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an error `code` and `msg`, classified into a `reason`

    `reason` is one of "NO_SPACE", "VALUE_TOO_LONG" or "NOT_SUPPORTED",
    decided from the errno in `code` or from known phrases in `msg`.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        out_of_space = (
            code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in msg or 'Disk quota exceeded' in msg)
        if out_of_space:
            reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1127
1128
class XAttrUnavailableError(YoutubeDLError):
    """Exception type that adds nothing to YoutubeDLError other than its name"""
1131
1132
def is_path_like(f):
    """Whether `f` is a str, bytes or os.PathLike object"""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1135
1136
def extract_timezone(date_str):
    """Split the timezone off the end of a date string

    A trailing "Z" or numeric "+hh[:]mm"/"-hh[:]mm" offset is tried first;
    failing that, a trailing upper-case name following a time is looked up in TIMEZONE_NAMES.

    @returns  (timezone, date_str): the offset as a `dt.timedelta` (zero if none was recognized)
              and the date string with the recognized timezone removed
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # No "Z"/numeric offset: look for a timezone name after the time instead
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        # The name is only stripped from the string if it is a known one
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        timezone = dt.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        # No sign group means that the bare "Z" alternative matched
        if not m.group('sign'):
            timezone = dt.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = dt.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1165
1166
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param delimiter  Separator between the date and the time part
    @param timezone   Offset (timedelta) to subtract; if None, it is extracted from date_str
    @returns          The timestamp, or None if date_str is None or cannot be parsed
    """
    if date_str is None:
        return None

    # Fractional parts are discarded
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        parsed = dt.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((parsed - timezone).timetuple())
    except ValueError:
        return None
1182
1183
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST if day_first is truthy, else DATE_FORMATS_MONTH_FIRST"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1186
1187
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    upload_date = None
    # NB: Every format is tried; if several match, the last one wins
    for fmt in date_formats(day_first):
        try:
            upload_date = dt.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the email (RFC 2822 style) date parser
        parsed = email.utils.parsedate_tz(date_str)
        if parsed:
            try:
                upload_date = dt.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return str(upload_date)
1210
1211
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string

    Returns None if date_str is not a str or if it cannot be parsed.
    The formats of `date_formats(day_first)` are tried first, then `email.utils.parsedate_tz`.
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas, pipes and weekday names, then collapse runs of whitespace
    # NOTE(review): the weekday pattern has no "sun" alternative - confirm whether that is intended
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # NOTE(review): 12 hours are added whenever "PM" occurs anywhere in the string,
    # which looks like it would also shift "12:xx PM" times - confirm against the formats in use
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # The first format that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta)
            return calendar.timegm(dt_.timetuple())

    # Fall back to the email date parser; falls through (returns None) if that fails too
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1243
1244
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL

    The part after the last "." of the URL (ignoring everything from the first "?")
    is returned if it is purely alphanumeric, or if it is a known extension
    followed by slashes. Otherwise default_ext is returned.
    """
    if url is None or '.' not in url:
        return default_ext
    path, _, _ = url.partition('?')
    candidate = path.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = candidate.rstrip('/')
    return stripped if stripped in KNOWN_EXTENSIONS else default_ext
1256
1257
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by "<sub_lang>.<sub_format>" (see `replace_extension`)"""
    sub_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, sub_ext, expected_real_ext)
1260
1261
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    The part before the last offset is resolved recursively, so offsets can be chained.

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(dt.datetime.now(dt.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - dt.timedelta(days=1)

    relative = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if relative is None:
        # Plain DATE
        return datetime_round(dt.datetime.strptime(date_str, format), precision)

    base = datetime_from_str(relative.group('start'), precision, format)
    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')

    if unit in ('month', 'year'):
        # Calendar-aware shift; the result is rounded to days in auto mode
        shifted = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + dt.timedelta(**{unit + 's': amount})

    return datetime_round(shifted, unit) if auto_precision else shifted
1302
1303
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param format  strftime format of DATE, passed on to datetime_from_str
    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    @raises ValueError  if strict is set and date_str does not follow the allowed patterns
    """
    if strict:
        allowed = re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str)
        if not allowed:
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1314
1315
def datetime_add_months(dt_, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    zero_based = dt_.month + months - 1
    year_shift, month_index = divmod(zero_based, 12)
    year = dt_.year + year_shift
    month = month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt_.replace(year, month, min(dt_.day, last_day))
1323
1324
def datetime_round(dt_, precision='day'):
    """
    Round a datetime object's time to a specific precision

    With precision "microsecond", dt_ is returned unchanged. Otherwise the result
    is a new datetime in UTC, rounded to the nearest day, hour, minute or second.
    """
    if precision == 'microsecond':
        return dt_

    seconds_per_unit = {'day': 86400, 'hour': 3600, 'minute': 60, 'second': 1}
    epoch_secs = calendar.timegm(dt_.timetuple())
    step = seconds_per_unit[precision]
    # Adding half a step before flooring rounds to the nearest multiple (halves go up)
    rounded = ((epoch_secs + step / 2) // step) * step
    return dt.datetime.fromtimestamp(rounded, dt.timezone.utc)
1341
1342
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Any other string is returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    year, month, day = match.groups()
    return f'{year}-{month}-{day}'
1351
1352
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A bound left as None is open: the earliest/latest representable date is used.
        Raises ValueError if start is after end.
        """
        self.start = dt.datetime.min.date() if start is None else date_from_str(start, strict=True)
        self.end = dt.datetime.max.date() if end is None else date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed first
        day = date if isinstance(date, dt.date) else date_from_str(date)
        return self.start <= day <= self.end

    def __repr__(self):
        cls_name = type(self).__name__
        return f'{__name__}.{cls_name}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __str__(self):
        return f'{self.start} to {self.end}'

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
1389
1390
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime and the platform (cached)"""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_ver = []

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1409
1410
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    # Second field of `platform.win32_ver()`, parsed by `version_tuple`
    return version_tuple(platform.win32_ver()[1])
1418
1419
def write_string(s, out=None, encoding=None):
    """Write the str `s` to `out` (default: sys.stderr) and flush it

    If `out` is opened in binary mode or has a `buffer`, `s` is written as bytes,
    encoded with `encoding` (or a fallback), dropping characters that cannot be encoded.
    Otherwise `s` is written as-is.
    """
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
    stream_mode = getattr(out, 'mode', None) or ''
    if 'b' in stream_mode:
        sink, enc = out, encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        sink = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
    else:
        sink, enc = out, None

    sink.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1440
1441
# TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report the use of deprecated functionality

    Outside the CLI, a DeprecationWarning is issued through the `warnings` module.
    In the CLI, each distinct message is reported only once, together with the bug report notice:
    through `printer` if given, else through `write_string`. kwargs are passed to whichever is used.
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return None

    seen = deprecation_warning._cache
    if msg in seen:
        return None
    seen.add(msg)

    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Messages that have already been reported in the CLI
deprecation_warning._cache = set()
1458
1459
def bytes_to_intlist(bs):
    """Return the items of `bs` as a list of ints

    Items that are already ints (as in bytes) are kept; otherwise `ord` is applied to each item.
    """
    if not bs:
        return []
    if not isinstance(bs[0], int):
        return list(map(ord, bs))
    return list(bs)  # Python 3
1467
1468
def intlist_to_bytes(xs):
    """Pack a sequence of ints (each one unsigned byte) into bytes; empty input gives b''"""
    if not xs:
        return b''
    layout = f'{len(xs)}B'
    return struct.pack(layout, *xs)
1473
1474
class LockingUnsupportedError(OSError):
    """Raised by the fallback `_lock_file`/`_unlock_file` that are defined when `fcntl` cannot be imported"""
    msg = 'File locking is not supported'

    def __init__(self):
        # Takes no arguments: always carries the fixed message above
        super().__init__(self.msg)
1480
1481
# Cross-platform file locking
# Defines `_lock_file(f, exclusive, block)` and `_unlock_file(f)` for the current platform:
#   win32:            LockFileEx/UnlockFileEx from kernel32
#   fcntl available:  flock(), falling back to lockf()
#   otherwise:        both raise LockingUnsupportedError
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte count (low/high DWORD) used to lock and unlock the "whole" file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the file object `f`; raises BlockingIOError if LockFileEx fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object since `_unlock_file` needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Unlock a file object locked with `_lock_file`; raises OSError if UnlockFileEx fails"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock the file object `f`; BlockingIOError from flock() is propagated as-is"""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Unlock the file object `f`, trying each unlocking method in turn"""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            # Last resort; an error from this one is not suppressed
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
1568
1569
class locked_file:
    """A file that is locked (see `_lock_file`) for as long as it is used as a context manager

    The file is opened on construction and locked on `__enter__`/`open()`;
    `__exit__`/`close()` unlocks and closes it. Reading (mode "r"/"rb") takes
    a shared lock, all other modes an exclusive one.
    Attributes that are not found on this object are looked up on the underlying file object `f`.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """
        @param mode      One of r, rb, a, ab, w, wb
        @param block     Whether to wait for the lock instead of failing immediately
        @param encoding  Passed to os.fdopen
        @raises NotImplementedError  for any other mode
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        fd = os.open(filename, flags, 0o666)
        try:
            self.f = os.fdopen(fd, mode, encoding=encoding)
        except BaseException:
            # fdopen() can fail without having closed the descriptor; don't leak it.
            # If it was closed already, the resulting error is ignored
            with contextlib.suppress(OSError):
                os.close(fd)
            raise

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncation is deferred until the lock is held (see O_TRUNC note in __init__)
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    # __exit__ is not called when __enter__ raises, so release the lock
                    # and close the file here; the truncation error is the one reported
                    with contextlib.suppress(OSError):
                        self.__exit__()
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; `locked` is reset even if unlocking fails"""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        if attr == 'f':
            # `f` is not set (e.g. __init__ did not complete); looking it up
            # through `self.f` below would recurse endlessly
            raise AttributeError(attr)
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1633
1634
@functools.cache
def get_filesystem_encoding():
    """Return `sys.getfilesystemencoding()`, or "utf-8" if that is None (cached)"""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1639
1640
# Translation tables applied by `shell_quote` to arguments that need quoting:
# `_CMD_QUOTE_TRANS` when quoting for the shell, `_WINDOWS_QUOTE_TRANS` otherwise
_WINDOWS_QUOTE_TRANS = str.maketrans({'"': R'\"'})
_CMD_QUOTE_TRANS = str.maketrans({
    # Keep quotes balanced by replacing them with `""` instead of `\\"`
    '"': '""',
    # These require an env-variable `=` containing `"^\n\n"` (set in `utils.Popen`)
    # `=` should be unique since variables containing `=` cannot be set using cmd
    '\n': '%=%',
    '\r': '%=%',
    # Use zero length variable replacement so `%` doesn't get expanded
    # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`)
    '%': '%%cd:~,%',
})
1653
1654
def shell_quote(args, *, shell=False):
    """Join one or more arguments into a single quoted command line string

    When `compat_os_name` is not "nt", this is `shlex.join`. Otherwise, arguments
    made up only of safe ASCII characters are kept as-is and all others are
    escaped and wrapped in double quotes.

    @param shell  Escape using `_CMD_QUOTE_TRANS` rather than `_WINDOWS_QUOTE_TRANS`
    """
    args = list(variadic(args))

    if compat_os_name != 'nt':
        return shlex.join(args)

    trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS

    def quote(arg):
        if re.fullmatch(r'[\w#$*\-+./:?@\\]+', arg, re.ASCII):
            return arg
        # Double any backslashes that precede a quote or the end of the argument
        escaped = re.sub(r'(\\+)("|$)', r'\1\1\2', arg).translate(trans)
        return f'"{escaped}"'

    return ' '.join(map(quote, args))
1666
1667
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    `data` is JSON-encoded into the URL fragment. If `url` already carries smuggled
    data, the two are merged, with the values already in the URL taking precedence.
    The `data` mapping passed in is left unmodified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a new dict instead of updating the caller's `data` in place
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps({**data, **idata})})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by `smuggle_url` into (url, data)

    If the URL carries no smuggled data, (smug_url, default) is returned.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
1685
1686
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format a number with a decimal suffix like k, M, etc

    @param num     the number to format; converted using `float_or_none`
    @param fmt     %-format string that receives `(scaled_number, suffix)`
    @param factor  ratio between consecutive suffixes; with 1024 the
                   binary suffixes (Ki, Mi, ...) are used
    @returns       the formatted string, or None if `num` is invalid or negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to the available suffixes. The lower bound matters for numbers smaller
        # than 1/factor: their negative exponent would otherwise index the suffix list
        # from the end and scale the number in the wrong direction
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
1699
1700
def format_bytes(bytes):
    """Format a byte count with two decimals and a binary suffix (factor 1024); 'N/A' if that fails"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
1703
1704
def lookup_unit_table(unit_table, s, strict=False):
    """Parse a "<number><unit>" string and scale the number by the unit's multiplier

    @param unit_table  dict mapping unit names to multipliers
    @param s           the string to parse
    @param strict      if set, the whole string must match and only `.` is accepted
                       as decimal separator; otherwise only the start of the string
                       has to match and `,` is accepted as well
    @returns           the rounded result, or None if the string does not match
    """
    if strict:
        matcher, num_re = re.fullmatch, NUMBER_RE
    else:
        matcher, num_re = re.match, NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(map(re.escape, unit_table))
    mobj = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if mobj is None:
        return None

    value = float(mobj['num'].replace(',', '.'))
    return round(value * unit_table[mobj['unit']])
1716
1717
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer

    The string is upper-cased first; the units '', K, M, ... Y are powers of 1024
    """
    units = ('', *'KMGTPEZY')
    multipliers = {unit: 1024 ** power for power, unit in enumerate(units)}
    return lookup_unit_table(multipliers, s.upper(), strict=True)
1723
1724
def parse_filesize(s):
    """Parse a file size such as "1.5 MiB" into a number of bytes

    Returns None if `s` is None or cannot be parsed
    """
    # One group of units per prefix: binary (1024-based) and decimal (1000-based) spellings.
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    units = {
        'B': 1, 'b': 1, 'bytes': 1,
        'KiB': 1024, 'KB': 1000, 'kB': 1024, 'Kb': 1000, 'kb': 1000,
        'kilobytes': 1000, 'kibibytes': 1024,
        'MiB': 1024 ** 2, 'MB': 1000 ** 2, 'mB': 1024 ** 2, 'Mb': 1000 ** 2, 'mb': 1000 ** 2,
        'megabytes': 1000 ** 2, 'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3, 'GB': 1000 ** 3, 'gB': 1024 ** 3, 'Gb': 1000 ** 3, 'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3, 'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4, 'TB': 1000 ** 4, 'tB': 1024 ** 4, 'Tb': 1000 ** 4, 'tb': 1000 ** 4,
        'terabytes': 1000 ** 4, 'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5, 'PB': 1000 ** 5, 'pB': 1024 ** 5, 'Pb': 1000 ** 5, 'pb': 1000 ** 5,
        'petabytes': 1000 ** 5, 'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6, 'EB': 1000 ** 6, 'eB': 1024 ** 6, 'Eb': 1000 ** 6, 'eb': 1000 ** 6,
        'exabytes': 1000 ** 6, 'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7, 'ZB': 1000 ** 7, 'zB': 1024 ** 7, 'Zb': 1000 ** 7, 'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7, 'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8, 'YB': 1000 ** 8, 'yB': 1024 ** 8, 'Yb': 1000 ** 8, 'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8, 'yobibytes': 1024 ** 8,
    }

    return None if s is None else lookup_unit_table(units, s)
1794
1795
def parse_count(s):
    """Parse a count such as "1.2M" or "1,234 views" into an integer

    Returns None if `s` is None or no number could be extracted
    """
    if s is None:
        return None

    # Drop a leading non-numeric word, e.g. a label in front of the number
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000, 'K': 1000,
        'm': 1000 ** 2, 'M': 1000 ** 2,
        'kk': 1000 ** 2, 'KK': 1000 ** 2,
        'b': 1000 ** 3, 'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, s)
    if scaled is not None:
        return scaled

    # Fall back to a plain number followed by whitespace or the end of the string
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(mobj.group(1)) if mobj else None
1823
1824
def parse_resolution(s, *, lenient=False):
    """Extract the resolution from a string

    Recognizes "<width>x<height>" (also with `X`, `×` or `,` as separator),
    "<height>p"/"<height>i" and "4k"/"8k".

    @param s        the string to search; None gives an empty dict
    @param lenient  also accept "<width>x<height>" directly adjacent to letters or digits
    @returns        dict with the keys 'width' and/or 'height' that were found
    """
    if s is None:
        return {}

    size_re = (
        r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
        else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    size = re.search(size_re, s)
    if size:
        return {'width': int(size['w']), 'height': int(size['h'])}

    lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if lines:
        return {'height': int(lines[1])}

    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        # 4k => 2160, 8k => 4320
        return {'height': int(kilo[1]) * 540}

    return {}
1848
1849
def parse_bitrate(s):
    """Return the number in front of "kbps" in a string as an integer, or None if there is none"""
    if isinstance(s, str):
        found = re.search(r'\b(\d+)\s*kbps', s)
        if found:
            return int(found.group(1))
    return None
1856
1857
def month_by_name(name, lang='en'):
    """Return the number (1-12) of the month with the given full name, or None if unknown

    The names of language `lang` are used; the English names are the fallback
    for languages not present in MONTH_NAMES. Independent of the system locale
    """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name in names:
        return names.index(name) + 1
    return None
1867
1868
def month_by_abbreviation(abbrev):
    """Return the number (1-12) of the month whose English name starts with the
    three letters `abbrev` (locale-independently), or None if there is none"""
    for number, month in enumerate(ENGLISH_MONTH_NAMES, start=1):
        if month[:3] == abbrev:
            return number
    return None
1877
1878
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML

    The five predefined entities and numeric character references
    of up to four digits are left alone
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1885
1886
def setproctitle(title):
    """Set the name of the current process through `prctl` from `libc.so.6` (best effort)

    Returns silently if `ctypes`, the library or `prctl` is not available
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Passing the bytes (and not just their length) makes the buffer one item larger
    # so that it ends with a NUL. prctl expects a NUL-terminated string, so a buffer
    # of exactly len(title) bytes could make it read past the end for short titles
    buf = ctypes.create_string_buffer(title.encode())
    try:
        # PR_SET_NAME = 15      Ref: /usr/include/linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1913
1914
def remove_start(s, start):
    """Return `s` without the prefix `start`; `s` is returned as is if it is None or lacks the prefix"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1917
1918
def remove_end(s, end):
    """Return `s` without the suffix `end`; `s` is returned as is if it is None or lacks the suffix"""
    if s is None or not s.endswith(end):
        return s
    # Do not slice with `-len(end)`: for an empty `end` that is `s[:0]`, i.e. an empty string
    return s[:len(s) - len(end)]
1921
1922
def remove_quotes(s):
    """Strip one pair of matching single or double quotes that encloses `s`, if there is one"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1930
1931
def get_domain(url):
    """
    Return the network location of the URL without a leading "www.", or None if it is empty

    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1938
1939
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and trailing slashes"""
    trimmed = urllib.parse.urlparse(url).path.strip('/')
    return trimmed.rpartition('/')[2]
1943
1944
def base_url(url):
    """Return the http(s) URL up to and including the last `/` before any query or fragment

    Raises AttributeError if `url` does not match
    """
    mobj = re.match(r'https?://[^?#]+/', url)
    return mobj.group(0)
1947
1948
def urljoin(base, path):
    """Join `path` to `base`, tolerating invalid input

    Both arguments may be str or bytes.

    @returns  `path` itself if it already is an absolute or protocol-relative URL;
              None if `path` is empty or not a string, or if `base` is needed
              but is not an http(s) or protocol-relative URL;
              the joined URL otherwise
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    # A scheme may only contain letters, digits, `+`, `-` and `.`
    # `-` comes last in the character class so that it is a literal and not
    # part of a range (`+-.` would also allow `,`)
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
1962
1963
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to an int scaled by `invscale // scale`, or return `default` if that fails

    If `get_attr` is given, the attribute of that name is converted instead of `v` itself
    """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    with contextlib.suppress(ValueError, TypeError, OverflowError):
        return int(v) * invscale // scale
    return default
1971
1972
def str_or_none(v, default=None):
    """Return `str(v)`, or `default` if `v` is None"""
    if v is None:
        return default
    return str(v)
1975
1976
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Integers are returned as is; in strings, `,`, `.` and `+` are ignored.
    Any other type gives None
    """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
1984
1985
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to a float scaled by `invscale / scale`, or return `default` if that fails"""
    if v is not None:
        with contextlib.suppress(ValueError, TypeError):
            return float(v) * invscale / scale
    return default
1993
1994
def bool_or_none(v, default=None):
    """Return `v` if it is a bool, and `default` for anything else"""
    if isinstance(v, bool):
        return v
    return default
1997
1998
def strip_or_none(v, default=None):
    """Return `v` stripped of surrounding whitespace if it is a str, and `default` otherwise"""
    if isinstance(v, str):
        return v.strip()
    return default
2001
2002
def url_or_none(url):
    """Return the stripped `url` if it looks like a URL, and None otherwise

    Accepted are protocol-relative URLs (`//...`) and URLs with an
    http(s), rtmp*, rtsp*, mms or ftp(s) scheme
    """
    if not (url and isinstance(url, str)):
        return None
    stripped = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', stripped):
        return stripped
    return None
2008
2009
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a timestamp using `date_format`, or return `default` if that fails

    @param timestamp    unix timestamp (int or float) or a date string in the form YYYYMMDD
    @param date_format  strftime format; `%s` is replaced by the unix timestamp
    @param default      returned for other types and if parsing or formatting fails
    """
    if not isinstance(timestamp, (int, float, str)):
        return default
    try:
        if isinstance(timestamp, str):  # assume YYYYMMDD
            moment = dt.datetime.strptime(timestamp, '%Y%m%d')
        else:  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, dt.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            epoch = dt.datetime.fromtimestamp(0, dt.timezone.utc)
            moment = epoch + dt.timedelta(seconds=timestamp)
        # Support %s on windows
        unix_time = int(moment.timestamp())
        date_format = re.sub(r'(?<!%)(%%)*%s', rf'\g<1>{unix_time}', date_format)
        return moment.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2027
2028
2029 def parse_duration(s):
2030     if not isinstance(s, str):
2031         return None
2032     s = s.strip()
2033     if not s:
2034         return None
2035
2036     days, hours, mins, secs, ms = [None] * 5
2037     m = re.match(r'''(?x)
2038             (?P<before_secs>
2039                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2040             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2041             (?P<ms>[.:][0-9]+)?Z?$
2042         ''', s)
2043     if m:
2044         days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2045     else:
2046         m = re.match(
2047             r'''(?ix)(?:P?
2048                 (?:
2049                     [0-9]+\s*y(?:ears?)?,?\s*
2050                 )?
2051                 (?:
2052                     [0-9]+\s*m(?:onths?)?,?\s*
2053                 )?
2054                 (?:
2055                     [0-9]+\s*w(?:eeks?)?,?\s*
2056                 )?
2057                 (?:
2058                     (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2059                 )?
2060                 T)?
2061                 (?:
2062                     (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
2063                 )?
2064                 (?:
2065                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2066                 )?
2067                 (?:
2068                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2069                 )?Z?$''', s)
2070         if m:
2071             days, hours, mins, secs, ms = m.groups()
2072         else:
2073             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2074             if m:
2075                 hours, mins = m.groups()
2076             else:
2077                 return None
2078
2079     if ms:
2080         ms = ms.replace(':', '.')
2081     return sum(float(part or 0) * mult for part, mult in (
2082         (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2083
2084
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the extension of `filename`

    If `expected_real_ext` is given and is not the actual extension,
    `ext` is appended to the whole filename instead
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
2091
2092
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of `filename` by `ext`

    If `expected_real_ext` is given and is not the actual extension,
    `ext` is appended to the whole filename instead
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return f'{stem}.{ext}'
2098
2099
def check_executable(exe, args=[]):
    """ Checks if the given binary can be run, e.g. since it is installed somewhere in PATH

    args can be a list of arguments for a short output (like -version).
    Returns the name of the binary, or False if running it raised OSError """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
2108
2109
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout and stderr as text

    Returns None if the return code is non-zero, and False if running raised OSError
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        output, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else output
2122
2123
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract the version from the output of an executable

    @param output        the output to search
    @param version_re    regex whose first group is the version;
                         defaults to the word after "version"
    @param unrecognized  returned if the regex does not match
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2133
2134
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    unrecognized holds one or two fallback values: the first is returned if no
    version is found in the output, the last if the output could not be obtained
    (None from `_get_exe_version_output`) """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    out = _get_exe_version_output(exe, args)
    if out is None:
        return unrecognized[-1]
    if not out:
        # False (could not be run) or empty output is passed on unchanged
        return out
    return detect_exe_version(out, version_re, unrecognized[0])
2145
2146
def frange(start=0, stop=None, step=1):
    """Float range

    Like `range`, but yields `start`, `start + step`, ... without requiring integers.
    With a single argument, that argument is the stop value. A zero step yields nothing
    """
    if stop is None:
        start, stop = 0, start
    if not step:
        return
    before_stop = operator.lt if step > 0 else operator.gt
    current = start
    while before_stop(current, stop):
        yield current
        current += step
2155
2156
class LazyList(collections.abc.Sequence):
    """Immutable list that fetches items from an iterable only when they are needed

    Fetched items are kept in a cache, so each item is taken from the iterable once.
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = _cache if _cache is not None else []
        self._reversed = reverse

    def __iter__(self):
        if not self._reversed:
            # First the items fetched so far, then whatever the iterable still provides
            yield from self._cache
            for item in self._iterable:
                self._cache.append(item)
                yield item
            return
        # We need to consume the entire iterable to iterate in reverse
        yield from self.exhaust()

    def _exhaust(self):
        """Fetch all remaining items and return the cache (in original order)"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        items = self._exhaust()
        return list(reversed(items)) if self._reversed else list(items)

    @staticmethod
    def _reverse_index(x):
        """Map an index of the reversed list to the index in the cache (0 => -1, 1 => -2, ...)"""
        if x is None:
            return None
        return ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        counts_from_end = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if counts_from_end:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            # Fetch only as many items as are needed to reach the highest index
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Probing for the first item of the iterable avoids fetching everything
        probe = -1 if self._reversed else 0
        try:
            self[probe]
        except self.IndexError:
            return False
        else:
            return True

    def __len__(self):
        return len(self._exhaust())

    def __reversed__(self):
        # The new object shares the iterable and the cache with this one
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        # The new object shares the iterable and the cache with this one
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2244
2245
class PagedList:
    """Base class for lists whose items are fetched one page at a time

    `pagefunc(pagenum)` has to return an iterable with the items of that page.
    Subclasses must implement `_getslice`
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the items of page `pagenum` as a list, fetching them unless they are cached"""
        page = self._cache.get(pagenum)
        if page is None:
            if pagenum > self._pagecount:
                # Beyond the last known page; don't call pagefunc
                page = []
            else:
                page = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page
        return page

    def getslice(self, start=0, end=None):
        """Return the items with index `start` up to (not including) `end` as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not (isinstance(idx, int) and idx >= 0):
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()

    def __bool__(self):
        return bool(self.getslice(0, 1))
2287
2288
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            pagesize = self._pagesize
            firstid = pagenum * pagesize
            nextfirstid = firstid + pagesize
            if start >= nextfirstid:
                continue

            # Offset of the first wanted item, if the slice starts within this page
            startv = start % pagesize if firstid <= start < nextfirstid else 0
            # Offset just past the last wanted item, if the slice ends within this page
            if end is not None and firstid <= end <= nextfirstid:
                endv = (end - 1) % pagesize + 1
            else:
                endv = None

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the previous page as the last one before passing the error on
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # Stop in either of these cases:
            # - A little optimization - if current page is not "full", ie. does
            #   not contain page_size videos then we can assume that this page
            #   is the last one - there are no more ids on further pages -
            #   i.e. no need to query again.
            # - We got the whole page, but the next page is not interesting
            if len(page_results) + startv < pagesize or end == nextfirstid:
                break
2328
2329
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        # Always uses the cache
        super().__init__(pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        # Page containing `start`, and the number of items to skip within that page
        first_page, skip = divmod(start, self._pagesize)
        if end is None:
            last_page, remaining = self._pagecount, None
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start

        for pagenum in range(first_page, last_page):
            page = self.getpage(pagenum)
            if skip:
                page = page[skip:]
                skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # The slice ends within this page
                    yield from page[:remaining]
                    break
                remaining -= len(page)
            yield from page
2354
2355
class PlaylistEntries:
    """Access to the entries of a playlist info dict by playlist index

    Indexing an instance gives a generator of `(playlist_index, entry)` pairs,
    where the playlist index is 1-based.
    """
    # Placeholder for the positions of an incomplete playlist that have no entry
    MissingEntry = object()
    # True if the entries are a list, or once indexing has run past the last entry
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        object providing `params`, `report_warning` and `_match_entry`
        @param info_dict  playlist info dict; its 'entries' and 'requested_entries' are read
        Raises EntryNotInPlaylist if the info dict has no entries
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Place each entry at its (1-based) requested position;
            # the positions in between stay MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            # Any other iterable is wrapped so that it can be indexed lazily
            self._entries = LazyList(entries)

    # A single segment of a playlist items specification:
    # either an index, or a range "[start]:[end][:step]" ("-" is accepted in place of the first ":")
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Parse a comma-separated playlist items specification

        Yields an int for each single index and a slice for each range.
        Raises ValueError for empty or invalid segments and for a zero step
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield `(playlist_index, entry)` for the entries selected by the params
        'playlist_items', or else by 'playliststart' and 'playlistend'"""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    # Stop yielding any further entries
                    return

    def get_full_count(self):
        """Return the total number of entries if it is known without further extraction, else None

        It is known if the entries are exhausted and complete, or if they
        are an InAdvancePagedList with a single entry per page
        """
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function that returns the entry at a 0-based index

        The returned function raises `self.IndexError` if the index is past the end.
        For a list, it raises EntryNotInPlaylist if the entry is missing
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    # An incomplete playlist may be longer than the entries that are known
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Indexing may trigger extraction, so it is wrapped with
                    # the `_handle_extraction_exceptions` decorator of `ydl`
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Generate `(playlist_index, entry)` for the given 1-based index or slice

        Unlike with regular slices, the stop of the slice is included
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            # Negative values count from the end, which requires the length
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move the stop one step further so that it is included
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # When counting upwards, all further indices are past the end too
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts by iterating over all entries
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2490
2491
def uppercase_escape(s):
    r"""Replace every `\UXXXXXXXX` escape sequence in `s` by the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', unescape, s)
2498
2499
def lowercase_escape(s):
    r"""Replace every `\uXXXX` escape sequence in `s` by the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', unescape, s)
2506
2507
def parse_qs(url, **kwargs):
    """Parse the query string of a URL; `kwargs` are passed on to `urllib.parse.parse_qs`"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2510
2511
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file and close it

    @param batch_fd  iterable of lines (str or bytes) with a `close` method
    @returns         list of the URLs, one per line; empty lines and lines
                     starting with `#`, `;` or `]` are skipped
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # The BOM both as decoded character and as its three bytes read as single characters
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit is passed by keyword; passing it positionally is deprecated
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2529
2530
def urlencode_postdata(*args, **kargs):
    """URL-encode the data like `urllib.parse.urlencode` and return it as ASCII bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2533
2534
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  update query
       @returns             str

       A str `url` is returned unchanged if there is nothing to replace or update
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        # Values from query_update replace those of the same name in the existing query
        query = urllib.parse.parse_qs(url.query)
        query.update(query_update)
        kwargs['query'] = urllib.parse.urlencode(query, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2553
2554
def update_url_query(url, query):
    """Shorthand for update_url() that only merges `query` into the query string of `url`"""
    return update_url(url=url, query_update=query)
2557
2558
2559 def _multipart_encode_impl(data, boundary):
2560     content_type = 'multipart/form-data; boundary=%s' % boundary
2561
2562     out = b''
2563     for k, v in data.items():
2564         out += b'--' + boundary.encode('ascii') + b'\r\n'
2565         if isinstance(k, str):
2566             k = k.encode()
2567         if isinstance(v, str):
2568             v = v.encode()
2569         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2570         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2571         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2572         if boundary.encode('ascii') in content:
2573             raise ValueError('Boundary overlaps with data')
2574         out += content
2575
2576     out += b'--' + boundary.encode('ascii') + b'--\r\n'
2577
2578     return out, content_type
2579
2580
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary; a clash
        between it and the data raises ValueError. Otherwise random
        boundaries are generated until one does not occur in the data.

    Returns a tuple (encoded body, Content-Type header value).

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-chosen boundary is never replaced, so any ValueError propagates
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary collided with the payload; draw another one
            continue
2609
2610
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether `x` is an instance of `allowed_types` but not of `blocked_types`

    When `blocked_types` is not given, str, bytes and mappings are blocked.
    """
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
2615
2616
def variadic(x, allowed_types=NO_DEFAULT):
    """Return `x` itself if it is iterable-like, else wrap it in a 1-tuple

    @param allowed_types  type or tuple of types that count as a single value
                          even when iterable (passed to is_iterable_like as
                          `blocked_types`); any other kind of iterable is
                          deprecated and gets converted to a tuple
    """
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
2622
2623
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn and return the first acceptable result

    Every function is called as f(*args, **kwargs). A call that raises one of
    the common lookup/conversion errors (AttributeError, KeyError, TypeError,
    IndexError, ValueError, ZeroDivisionError) is skipped, as is a result that
    is not an instance of `expected_type` (when given).

    @returns  the first accepted value, or None if there is none
    """
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
2633
2634
def try_get(src, getter, expected_type=None):
    """Apply one getter, or each of several getters, to `src` via try_call()

    @returns  whatever try_call() returns for the getters called with `src`
    """
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
2637
2638
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of `dct` for which cndn(key, value) is truthy

    By default, items whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
2641
2642
def merge_dicts(*dicts):
    """Merge dicts, with earlier dicts taking precedence

    A key gets the first non-None value seen for it. The only value that can
    be overridden later is an empty string, and only by another string.
    """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if key not in merged:
                if value is not None:
                    merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
2651
2652
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` unchanged if it is a str, else decode it with str(string, encoding, errors)

    Note that the default `encoding` is computed once, when the function is defined.
    """
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
2655
2656
# Rating label -> age limit returned by parse_age_limit() for that label.
# Keys are upper case because parse_age_limit() upper-cases its input before the lookup
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2664
2665
# "TV-xx" rating label -> age limit returned by parse_age_limit() for that label.
# parse_age_limit() builds its regex from the part of each key after "TV-",
# so every key must keep that 3-character prefix
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2674
2675
def parse_age_limit(s):
    """Convert an age rating to a numeric age limit

    @param s  int (accepted as is when within 0..21), or a string that is
              either 1-2 digits with an optional trailing "+", a key of
              US_RATINGS, or a TV_PARENTAL_GUIDELINES rating (the separator
              after "TV" may be "-", "_" or absent); letter case is ignored
    @returns  the age limit as int, or None if `s` is not understood
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, str):
        return None

    age_match = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if age_match:
        return int(age_match.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    # Match on the part after "TV-" so that the separator can vary
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_match = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_match:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_match.group(1)]
    return None
2692
2693
def strip_jsonp(code):
    """Return the argument text of a JSONP call such as ``cb({...});``

    The callback may be prefixed with "window.", guarded as ``cb && cb(...)``
    and followed by ``//`` comments. Input that does not have this shape is
    returned unchanged.
    """
    JSONP_RE = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(JSONP_RE, r'\g<callback_data>', code)
2702
2703
def js_to_json(code, vars={}, *, strict=False):
    """Best-effort rewrite of JavaScript literal source text into JSON text

    @param code    the JavaScript source to convert
    @param vars    dict of identifier -> replacement text to substitute
                   (only read here, never modified)
    @param strict  if true, identifiers that cannot be resolved raise
                   ValueError instead of being emitted as strings, and the
                   lenient rewrites (new Date(...), other new X(...),
                   parseInt(...), immediately invoked functions) are skipped
    @returns       the converted text; it is not validated as JSON here
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # One alternative per quote character; backslash escapes are allowed inside
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    # /* block */ comments, or // comments up to and including the newline
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (regex, base) for hexadecimal and leading-zero octal integer literals,
    # each optionally followed by ":" (i.e. used as an object key)
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Group 1 is a bare double quote, group 2 the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Escapes that JSON knows are kept (a bare '"' thereby becomes '\"'),
        # "\x" is turned into the prefix "\u00" (the two hex digits that follow
        # are left in place), a backslash-newline continuation is dropped and
        # any other escaped character loses its backslash
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Convert the expression inside "${...}" recursively; a string result
        # is unquoted so that its content is inlined into the template
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Replacement callback for every token matched by the final regex below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, "!" runs and trailing commas are removed
            return ''

        if v[0] in STRING_QUOTES:
            # Template literals get their "${...}" parts substituted first;
            # all strings are then re-emitted with double quotes
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Keys must be strings in JSON, values become decimal numbers
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                # Unless strict, a replacement that is not valid JSON is
                # serialized as a JSON string; valid JSON is inserted verbatim.
                # In strict mode the replacement is inserted without this check
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            # Anything unknown is quoted and emitted as a string
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # Turn the list of pairs given to "new Map(...)" into a JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    # Textual pre-processing of constructs that have a simple JSON equivalent
    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # new Date("...") -> "..."; any other "new X(...)" -> the whole
        # expression as a string; parseInt(...) -> the digits it contains;
        # (function(...){...})("...") -> its string argument
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    # Tokens handed to fix_kv, in order of preference: strings, comments,
    # trailing commas, "void 0"/identifiers, hex/octal integers (optionally
    # used as keys), decimal integers used as keys, and runs of "!"
    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2783
2784
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in `quality_ids`,
    or to -1 if the id is not listed
    """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
2793
2794
# NOTE(review): presumably the names of the stages at which a postprocessor
# can be scheduled to run - confirm against the users of this tuple
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# %-style templates (see STR_FORMAT_RE_TMPL below for the syntax), keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Template type -> string or None.
# NOTE(review): the strings look like filename suffixes for the respective
# file type, with None where there is no fixed one - verify against callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# This is a template for str.format(): {0} takes the regex for the mapping key
# and {1} the regex for the conversion type. Only "%" preceded by an even
# number of "%" (captured as "prefix") starts a match
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of %-formatting, one character per type
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2834
2835
def limit_length(s, length):
    """ Add ellipses to overly long strings

    @param s       the string to shorten, or None
    @param length  maximum length of the result
    @returns       None if `s` is None; `s` itself if it fits; otherwise `s`
                   cut down so that it ends in "..." and is `length` long
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length <= len(ELLIPSES):
        # No room for any text before the ellipsis. Slicing `s` with the
        # resulting zero/negative index would yield a string that is longer
        # than `length` (or even than `s`), so only the ellipsis, shortened
        # as needed, is returned
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
2844
2845
def version_tuple(v):
    """Split a version string on "." and "-" and return its parts as a tuple of ints

    Raises ValueError if a part is not an integer.
    """
    return tuple(map(int, re.split(r'[-.]', v)))
2848
2849
def is_outdated_version(version, limit, assume_new=True):
    """Whether `version` is lower than `limit`, compared as version tuples

    If `version` is empty or either version cannot be parsed, the answer is
    `not assume_new`.
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            pass
    return not assume_new
2857
2858
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at module level
    from ..update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
2865
2866
def args_to_str(args):
    """Get a short string representation for a subprocess command (delegates to shell_quote)"""
    quoted = shell_quote(args)
    return quoted
2870
2871
def error_to_str(err):
    """Format an exception as "<class name>: <message>" """
    class_name = type(err).__name__
    return f'{class_name}: {err}'
2874
2875
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type to a file extension

    Parameters after ";" are ignored, as are surrounding whitespace and letter
    case. The table is looked up by the full type, then by the subtype, then
    by the part of the subtype after its last "+".

    @param mt       the MIME type; anything that is not a str counts as unknown
    @param default  returned for unknown types, if given
    @returns        the extension. Without `default`, an unknown str type
                    yields its subtype with "+" replaced by "." and a non-str
                    `mt` yields None
    """
    if not isinstance(mt, str):
        return None if default is NO_DEFAULT else default

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'video/ogg': 'ogv',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    mimetype = mt.partition(';')[0].strip().lower()
    subtype = mimetype.rpartition('/')[2]

    # Most specific key first: "type/subtype", then "subtype", then its "+suffix"
    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    if default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
2965
2966
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension or for a file name/URL

    @returns  the type reported by mimetypes.guess_type (None if unknown),
              or None for empty input
    """
    if not ext_or_url:
        return None
    # A value without any "." is taken to be a bare extension
    target = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _ = mimetypes.guess_type(target)
    return mime_type
2973
2974
def parse_codecs(codecs_str):
    """Split a comma separated codecs string into video, audio and subtitle codecs

    Only the first codec of each kind is kept. Unknown codecs cause a warning
    to be written with write_string().

    @param codecs_str  e.g. "avc1.64001f, mp4a.40.2"
    @returns  {} for empty input. If any known codec was found: a dict with
              "vcodec" and "acodec" ("none" if absent), "dynamic_range"
              ("DV", "HDR10" or None) and, only if present, "scodec".
              Otherwise, if there are exactly two entries, they are returned
              as vcodec and acodec, in that order; else {}
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros that are directly followed by a digit are ignored for classification
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if not (vcodec or acodec or scodec):
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        return {}

    result = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if scodec is not None:
        result['scodec'] = scodec
    return result
3015
3016
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension that can hold the given streams

    @param vcodecs, vexts  codecs and extensions of the video streams (equally long)
    @param acodecs, aexts  codecs and extensions of the audio streams (equally long)
    @param preferences     optional sequence of acceptable extensions, best first
    @returns  "mkv" right away when it is allowed and there are several streams
              of one kind; else the first candidate whose known codec set holds
              both codecs, else the first one compatible with all the given
              extensions, else "mkv" if allowed or the last preference
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def codec_family(codec_list):
        # First codec without its ".profile" part and without zeros, lowercased
        # (None when try_get cannot evaluate this, e.g. for an empty list)
        return try_get(codec_list, lambda x: x[0].split('.')[0].replace('0', '').lower())

    wanted_codecs = (codec_family(vcodecs), codec_family(acodecs))

    for ext in preferences or COMPATIBLE_CODECS:
        if ext == 'mkv':
            return ext
        if COMPATIBLE_CODECS.get(ext, set()).issuperset(wanted_codecs):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        involved_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or involved_exts == {ext}:
            return ext
        if any(family.issuperset(involved_exts) for family in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3056
3057
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Determine a file extension from the headers of a response

    Sources, in order: the filename of an "attachment" Content-Disposition,
    the text after the last "." of the x-amz-meta-name header, and finally
    mimetype2ext() on the Content-Type (which receives `default`).

    @param url_handle  object whose `headers` attribute has a get() method
    """
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    meta_name = headers.get('x-amz-meta-name')
    if meta_name:
        ext = meta_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'), default=default)
3076
3077
def encode_data_uri(data, mime_type):
    """Build a base64 "data:" URI of the given MIME type from the bytes `data`"""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3080
3081
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    @param content_limit  age limit of the content; None if it is available for everyone
    @param age_limit      the limit to compare against; None if no limit is set
    """
    # Nothing is blocked if either side is unknown
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3090
3091
# List of known byte-order-marks (BOM) as (marker bytes, codec name) pairs.
# is_html() tries them in this order, so the 4-byte UTF-32-LE mark has to stay
# ahead of the UTF-16-LE mark, which is a prefix of it
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3100
3101
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Known BOMs at the start are removed and select the encoding (UTF-8 if
    there is none). Returns a match object, which is truthy, if the decoded
    text starts with optional whitespace and "<"; otherwise None
    """

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        while first_bytes.startswith(bom):
            first_bytes = first_bytes[len(bom):]
            encoding = bom_encoding

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3111
3112
def determine_protocol(info_dict):
    """Return the download protocol for the given info dict

    An explicit "protocol" entry wins. Otherwise the protocol is derived from
    the sanitized "url": its rtmp/mms/rtsp prefix, its extension (m3u8 gives
    "m3u8" for live content and "m3u8_native" otherwise; f4m gives "f4m"),
    or else the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3133
3134
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param header_row  list of column titles
    @param data        list of rows
    @param delim       if truthy, the string repeated to draw a line below the header
    @param extra_gap   additional number of spaces between columns
    @param hide_empty  leave out columns in which all data cells are empty
    @returns           the table as a string, one line per row, without
                       trailing whitespace on any line
    """
    def width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Cells beyond the end of filterArray are always kept
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # A column width of 0 is falsy and thereby filters its column out
    visible = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, visible)
    data = [filter_using_list(row, visible) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        separator = [delim * (ml + extra_gap) for ml in max_lens]
        separator[-1] = separator[-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
        table = [header_row, separator] + data

    def pad(text, col_width):
        slack = col_width - width(text)
        if '\t' in text:
            # The tab marks where the padding goes, so what follows it is right aligned
            return text.replace('\t', ' ' * slack) + ' ' * extra_gap
        return text + ' ' * (slack + extra_gap)

    lines = [
        ''.join(pad(str(cell), max_lens[pos]) for pos, cell in enumerate(row)).rstrip()
        for row in table]
    return '\n'.join(lines)
3165
3166
3167 def _match_one(filter_part, dct, incomplete):
3168     # TODO: Generalize code with YoutubeDL._build_format_filter
3169     STRING_OPERATORS = {
3170         '*=': operator.contains,
3171         '^=': lambda attr, value: attr.startswith(value),
3172         '$=': lambda attr, value: attr.endswith(value),
3173         '~=': lambda attr, value: re.search(value, attr),
3174     }
3175     COMPARISON_OPERATORS = {
3176         **STRING_OPERATORS,
3177         '<=': operator.le,  # "<=" must be defined above "<"
3178         '<': operator.lt,
3179         '>=': operator.ge,
3180         '>': operator.gt,
3181         '=': operator.eq,
3182     }
3183
3184     if isinstance(incomplete, bool):
3185         is_incomplete = lambda _: incomplete
3186     else:
3187         is_incomplete = lambda k: k in incomplete
3188
3189     operator_rex = re.compile(r'''(?x)
3190         (?P<key>[a-z_]+)
3191         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3192         (?:
3193             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3194             (?P<strval>.+?)
3195         )
3196         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3197     m = operator_rex.fullmatch(filter_part.strip())
3198     if m:
3199         m = m.groupdict()
3200         unnegated_op = COMPARISON_OPERATORS[m['op']]
3201         if m['negation']:
3202             op = lambda attr, value: not unnegated_op(attr, value)
3203         else:
3204             op = unnegated_op
3205         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3206         if m['quote']:
3207             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3208         actual_value = dct.get(m['key'])
3209         numeric_comparison = None
3210         if isinstance(actual_value, (int, float)):
3211             # If the original field is a string and matching comparisonvalue is
3212             # a number we should respect the origin of the original field
3213             # and process comparison value as a string (see
3214             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3215             try:
3216                 numeric_comparison = int(comparison_value)
3217             except ValueError:
3218                 numeric_comparison = parse_filesize(comparison_value)
3219                 if numeric_comparison is None:
3220                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3221                 if numeric_comparison is None:
3222                     numeric_comparison = parse_duration(comparison_value)
3223         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3224             raise ValueError('Operator %s only supports string values!' % m['op'])
3225         if actual_value is None:
3226             return is_incomplete(m['key']) or m['none_inclusive']
3227         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3228
3229     UNARY_OPERATORS = {
3230         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3231         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3232     }
3233     operator_rex = re.compile(r'''(?x)
3234         (?P<op>%s)\s*(?P<key>[a-z_]+)
3235         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3236     m = operator_rex.fullmatch(filter_part.strip())
3237     if m:
3238         op = UNARY_OPERATORS[m.group('op')]
3239         actual_value = dct.get(m.group('key'))
3240         if is_incomplete(m.group('key')) and actual_value is None:
3241             return True
3242         return op(actual_value)
3243
3244     raise ValueError('Invalid filter part %r' % filter_part)
3245
3246
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    The conditions are separated by "&" (a literal "&" is written as "\\&")
    and all of them have to pass
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3257
3258
def match_filter_func(filters, breaking_filters=None):
    """Build a function that tests info dicts against filter strings

    @param filters           a filter string or several of them, one of which
                             has to match; the special entry "-" turns on the
                             interactive mode
    @param breaking_filters  filters which, when they reject an entry, make
                             the returned function raise RejectedVideoReached
    @returns  None if no filters are given at all, else a function
              f(info_dict, incomplete=False) returning None if the entry
              passes (NO_DEFAULT instead in interactive mode, unless
              `incomplete`), or a message saying why the entry is skipped
    """
    if not (filters or breaking_filters):
        return None
    repr_ = f'{match_filter_func.__module__}.{match_filter_func.__qualname__}({filters}, {breaking_filters})'

    # Without breaking filters, a stand-in that never rejects is used
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    filters.discard('-')

    @function_with_repr.set_repr(repr_)
    def _match_func(info_dict, incomplete=False):
        rejection = breaking_filters(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if filters and not any(match_str(f, info_dict, incomplete) for f in filters):
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
        return NO_DEFAULT if interactive and not incomplete else None
    return _match_func
3284
3285
class download_range_func:
    """Callable that yields the sections of a video that are to be downloaded

    @param chapters   regexes; every chapter whose title matches one is selected
    @param ranges     (start, end) pairs in seconds; negative values count
                      from the end if the duration is known
    @param from_info  also use "start_time"/"end_time" of the info dict

    Note that equality and repr() only consider `chapters` and `ranges`.
    """

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters = chapters
        self.ranges = ranges
        self.from_info = from_info

    def __call__(self, info_dict, ydl):
        """Yield one dict per selected section

        Matching chapters are yielded along with their "index", ranges as
        "start_time"/"end_time". If there is nothing to select by, a single
        empty dict is yielded. `ydl.to_screen` is used to report that no
        chapter matched.
        """
        if info_dict.get('chapters'):
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'

        for regex in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(regex, chapter['title']):
                    continue
                warning = None
                yield {**chapter, 'index': index}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            # A missing bound means the very start/end of the video
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        """Convert a negative timestamp to one relative to the end, but not below 0"""
        if info.get('duration') and time < 0:
            return max(info['duration'] + time, 0)
        return time

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3326
3327
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds

    Understands a plain number with an optional "s" suffix and clock times of
    the form H:MM:SS, optionally with a fraction after "." or ":".

    @returns  the time in seconds, or None if the expression is empty or not understood
    """
    if not time_expr:
        return None

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        # The fraction may be separated by ":" instead of "."
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
3339
3340
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as a timecode of the form HH:MM:SS,mmm"""
    # presumably (hours, minutes, seconds, milliseconds) - verify against timetuple_from_msec
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
3343
3344
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as a timecode of the form H:MM:SS.cc"""
    timetuple = timetuple_from_msec(seconds * 1000)
    # The last field is replaced by hundredths of a second; "%02d" drops the fraction
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
3348
3349
def dfxp2srt(dfxp_data):
    '''
    Convert a DFXP/TTML subtitle document to SRT

    Every <p> element that has a begin time and either an end time or a
    duration becomes one SRT cue. The styling properties listed in
    SUPPORTED_STYLING are rendered with <b>, <i>, <u> and <font> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> elements
    '''
    # Namespace URIs of older drafts, rewritten to the current URI before
    # parsing so that one set of XPath prefixes matches all of them.
    # Order matters: rewriting the first group turns '...ttaf1#style' into
    # '...ttml#style', which the second group then maps to '...ttml#styling'
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # The only tts:* attributes that are read; anything else is ignored
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # style id -> {property: value}, filled from the <style> elements below
    styles = {}
    # Properties inherited by every element; filled from <body>/<div> below
    default_style = {}

    class TTMLPElementParser:
        '''Parser target that renders one <p> element as SRT-style markup'''
        _out = ''
        # NOTE(review): these two lists are class attributes that are only
        # ever mutated in place, so all instances share them. start() pushes
        # onto _applied_styles whenever a style is in effect, but end() pops
        # only when the element opened at least one tag, so entries can
        # outlive a paragraph and suppress tags in later ones. Left as is
        # because the produced output depends on it - confirm before changing
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence, lowest to highest: document default, the
                # referenced style, the element's own tts:* attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties that are already in effect
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    # All font attributes are emitted together in one tag
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                # Remember which tags end() has to close for this element
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close in reverse order of opening to keep the tags nested
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize the element and feed it back through a parser whose
        # target is TTMLPElementParser; close() returns the rendered text
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    # Fall back to un-namespaced <p> elements
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # A style may inherit from a parent that is declared later in the
    # document, so make passes over all <style> elements until a pass leaves
    # nothing unresolved
    repeat = False
    while True:
        known_styles = len(styles)
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            # Register the id even if none of its properties is supported,
            # so that styles inheriting from it can still be resolved
            style_props = styles.setdefault(style_id, {})
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    style_props[prop] = prop_val
        # Another pass only helps if this one resolved a new style. Without
        # that, the remaining parent references are dangling or cyclic and
        # would keep the loop running forever; those styles are left out
        if repeat and len(styles) > known_styles:
            repeat = False
        else:
            break

    # The style referenced by <body> and by <div> becomes the default for
    # all elements, <div> overriding <body>
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # Cue numbers count skipped paragraphs too, so they may have gaps
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        # A missing (or zero) end time is derived from the duration
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3516
3517
def cli_option(params, command_option, param, separator=None):
    """Build the command line fragment for an option that takes a value

    The value is looked up as params[param]. Returns an empty list if it is
    missing or None, [command_option, str(value)] if no separator is given,
    or a single-element list joining option and value with the separator.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3523
3524
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line fragment for a boolean option

    params[param] must be True, False or missing/None. True and False are
    rendered as true_value and false_value respectively and passed through
    cli_option(); a missing/None setting yields an empty list.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    # cli_option() performs the lookup again, this time in the rendering table
    rendered_values = {True: true_value, False: false_value}
    return cli_option(rendered_values, command_option, flag, separator)
3529
3530
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else []"""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3533
3534
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick the extra command line arguments configured for the given keys

    @param argdict      dict mapping lower-case keys to argument lists.
                        A plain list/tuple is also accepted for backward
                        compatibility and is returned unchanged when
                        use_compat is set (otherwise it is ignored)
    @param keys         list/tuple of candidates in order of preference;
                        a candidate may itself be a group of keys, whose
                        argument lists are then concatenated
    @param default      returned as is when nothing matches
    @param use_compat   whether a list/tuple argdict is honoured
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        return argdict if use_compat else default
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_group in keys:
        configured = [
            args for args in (argdict.get(key.lower()) for key in variadic(key_group))
            if args is not None]
        # The first candidate with at least one configured key wins
        if configured:
            return list(itertools.chain.from_iterable(configured))
    return default
3553
3554
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Look up configured arguments for an executable used by a component

    The root key is the lower-cased exe if it equals the lower-cased
    main_key, otherwise 'main_key+exe'. Each entry of keys is a suffix
    appended to the root key. If the bare root key is among the candidates,
    the (main_key, exe) pair (when they differ) and 'default' are added as
    fallbacks; otherwise the list/tuple compatibility mode is disabled.
    The result comes from cli_configuration_args().
    """
    main_key = main_key.lower()
    exe = exe.lower()
    same_name = main_key == exe
    root_key = exe if same_name else f'{main_key}+{exe}'
    candidates = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in candidates:
        use_compat = False
    else:
        if not same_name:
            candidates.append((main_key, exe))
        candidates.append('default')
    return cli_configuration_args(argdict, candidates, default, use_compat)
3566
3567
class ISO639Utils:
    """Conversion between two-letter and three-letter language codes"""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked up, so anything
        following them is ignored. Returns None for unknown codes.
        """
        short_code = code[:2]
        return cls._lang_map.get(short_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        If several short codes map to the same long code, the one listed
        first in the table is returned. Returns None for unknown codes.
        """
        matches = (
            short_name for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
3772
3773
class ISO3166Utils:
    """Lookup of country names by two-letter country code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter ISO 3166 country code to the corresponding full name

        The lookup is case-insensitive. Returns None for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4035
4036
class GeoUtils:
    """Generation of random IPv4 addresses within a country's address block"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address, as a dotted string, from an address block

        code_or_block is either a two-character country code (looked up
        case-insensitively in the table above) or an explicit block in
        'address/prefix-length' notation. Returns None if the country code
        has no block in the table.
        """
        if len(code_or_block) != 2:
            block = code_or_block
        else:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        network, prefix_len = block.split('/')
        (lowest,) = struct.unpack('!L', socket.inet_aton(network))
        # Setting all bits right of the prefix gives the highest address
        host_mask = 0xffffffff >> int(prefix_len)
        highest = lowest | host_mask
        picked = random.randint(lowest, highest)
        return str(socket.inet_ntoa(struct.pack('!L', picked)))
4295
4296
4297 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4298 # released into Public Domain
4299 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4300
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    The result has no leading zero bytes, except that zero (and any
    non-positive n) is encoded as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # Shortest big-endian encoding: just enough bytes for all set bits
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4329
4330
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    An empty byte string converts to 0.
    """
    # Left-pad with zero bytes so that the data splits into whole 32-bit words
    missing = -len(s) % 4
    if missing:
        s = b'\000' * missing + s
    value = 0
    for (word,) in struct.iter_unpack('>I', s):
        value = (value << 32) + word
    return value
4346
4347
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    The data is interpreted as a little-endian integer (its bytes are
    reversed before conversion) and raised to `exponent` modulo `modulus`.

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    reversed_hex = binascii.hexlify(data[::-1])
    message = int(reversed_hex, 16)
    return format(pow(message, exponent, modulus), 'x')
4363
4364
4365 def pkcs1pad(data, length):
4366     """
4367     Padding input data with PKCS#1 scheme
4368
4369     @param {int[]} data        input data
4370     @param {int}   length      target length
4371     @returns {int[]}           padded data
4372     """
4373     if len(data) > length - 11:
4374         raise ValueError('Input data too long for PKCS#1 padding')
4375
4376     pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4377     return [0, 2] + pseudo_random + [0] + data
4378
4379
4380 def _base_n_table(n, table):
4381     if not table and not n:
4382         raise ValueError('Either table or n must be specified')
4383     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4384
4385     if n and n != len(table):
4386         raise ValueError(f'base {n} exceeds table length {len(table)}')
4387     return table
4388
4389
4390 def encode_base_n(num, n=None, table=None):
4391     """Convert given int to a base-n string"""
4392     table = _base_n_table(n, table)
4393     if not num:
4394         return table[0]
4395
4396     result, base = '', len(table)
4397     while num:
4398         result = table[num % base] + result
4399         num = num // base
4400     return result
4401
4402
4403 def decode_base_n(string, n=None, table=None):
4404     """Convert given base-n string to int"""
4405     table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4406     result, base = 0, len(table)
4407     for char in string:
4408         result = result * base + table[char]
4409     return result
4410
4411
def decode_packed_codes(code):
    """
    Expand packed code found in `code` by PACKED_CODES_RE

    The regex (defined elsewhere in this module) is expected to capture, in
    order: the obfuscated code, the numeric base, the symbol count and a
    '|'-separated symbol list. Every word of the obfuscated code is replaced
    by the symbol whose index it encodes in that base; an empty symbol stands
    for the word itself.
    NOTE(review): raises AttributeError if the regex does not match - confirm
    that callers check for a match beforehand.
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    while count:
        count -= 1
        token = encode_base_n(count, base)
        symbol_table[token] = symbols[count] or token

    def substitute(word):
        return symbol_table[word.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, obfuscated_code)
4428
4429
def caesar(s, alphabet, shift):
    """
    Shift every character of s that occurs in alphabet by `shift` positions

    The shift wraps around the alphabet; characters that are not part of the
    alphabet are kept as they are. A shift of 0 returns s itself.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4437
4438
def rot47(s):
    """Apply a Caesar shift of 47 over the 94 ASCII characters from '!' to '~'"""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4441
4442
def parse_m3u8_attributes(attrib):
    """
    Parse a comma-separated KEY=value attribute list into a dict

    Values may be bare or double-quoted; the surrounding quotes are removed.
    If a key occurs more than once, the last value wins.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in pairs
    }
4450
4451
def urshift(val, n):
    """
    Shift val right by n bits, treating negative values as unsigned 32-bit

    A negative val has 2**32 added before the shift; non-negative values are
    shifted as they are.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
4454
4455
def write_xattr(path, key, value):
    """
    Set the extended attribute `key` of the file at `path` to `value`

    Tries, in order: NTFS alternate data streams (Windows only), a Python-level
    setxattr implementation (os.setxattr, pyxattr or xattr), and finally the
    `setfattr`/`xattr` command line tools.

    @param path   path of an existing file
    @param key    attribute name
    @param value  attribute value as bytes (it is decoded before being passed
                  to a command line tool)
    @raises XAttrMetadataError     if writing the attribute fails
    @raises XAttrUnavailableError  if no way of writing attributes was found
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            # The stream is addressed as "<path>:<key>"
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    setxattr = None
    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        # With an older pyxattr, setxattr stays None and Method 2 is used
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # Command line arguments have to be text
    value = value.decode()
    try:
        # The two tools take their arguments in a different form
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4507
4508
def random_birthday(year_field, month_field, day_field):
    """
    Generate a random date between 1950-01-01 and 1995-12-31 (both inclusive)

    @returns  dict mapping the three given field names to the year, month and
              day of the date, each as a string without zero-padding
    """
    earliest = dt.date(1950, 1, 1)
    latest = dt.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + dt.timedelta(random.randint(0, span_days))
    values = (birthday.year, birthday.month, birthday.day)
    return dict(zip((year_field, month_field, day_field), map(str, values)))
4519
4520
def find_available_port(interface=''):
    """
    Ask the OS for a currently unused port on the given interface

    A temporary socket is bound to port 0 and closed again right away.
    @returns  the port number that was assigned, or None if an OSError occurred
    """
    with contextlib.suppress(OSError):
        with socket.socket() as sock:
            sock.bind((interface, 0))
            return sock.getsockname()[1]
    return None
4528
4529
# Templates for internet shortcut files, which are plain text files.
# The templates contain printf-style named placeholders: %(url)s in all of
# them, and additionally %(filename)s in the .desktop template.

# Shortcut in INI form with an [InternetShortcut] section
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# Shortcut as an Apple property list (XML)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# Shortcut as a [Desktop Entry] of type Link
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps a link type name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4561
4562
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    Unicode hostnames are converted to Punycode, and an explicit port 80 is left out of the result.
    Raises ValueError for IPv6 hosts, which are not supported yet.
    """

    parts = urllib.parse.urlparse(iri)

    if '[' in parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # Characters that must NOT be percent-encoded in each component. Everything else
    # but letters, digits and '_.-' is percent-encoded with an underlying UTF-8 encoding.
    # '%' is in every set, so whatever is already percent-encoded is left as is.
    # Source: https://url.spec.whatwg.org/#percent-encoded-bytes.
    userinfo_safe = r"!$%&'()*+,~"
    # Also used for `params`; unsure there, since this is a legacy way of handling parameters.
    path_safe = r"!$%&'()*+,/:;=@|~"
    # Not totally sure, since the source does not explicitly mention the query URI component.
    query_safe = r"!$%&'()*+,/:;=?@{|}~"
    fragment_safe = r"!#$%&'()*+,/:;=?@{|}~"

    authority = []
    if parts.username:
        authority.append(urllib.parse.quote(parts.username, safe=userinfo_safe))
        if parts.password is not None:
            authority.append(':' + urllib.parse.quote(parts.password, safe=userinfo_safe))
        authority.append('@')

    # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
    authority.append(parts.hostname.encode('idna').decode())
    if parts.port is not None and parts.port != 80:
        authority.append(':' + str(parts.port))

    return urllib.parse.urlunparse((
        parts.scheme,
        ''.join(authority),
        urllib.parse.quote_plus(parts.path, safe=path_safe),
        urllib.parse.quote_plus(parts.params, safe=path_safe),
        urllib.parse.quote_plus(parts.query, safe=query_safe),
        urllib.parse.quote_plus(parts.fragment, safe=fragment_safe),
    ))
4605
4606
def to_high_limit_path(path):
    """
    Return a form of path that is not subject to MAX_PATH on Windows

    On win32 and cygwin the path is made absolute and prefixed with '\\\\?\\';
    on every other platform it is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
4613
4614
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """
    Look up `field` in `obj` and format the value with `template`

    The value is obtained with traversal.traverse_obj. When `ignore` is not
    given, any falsy value is replaced by `default`; otherwise only values
    contained in `ignore` (a single value or several of them) are.

    @param template  printf-style template applied to func(value)
    @param func      applied to the value before it is formatted
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skipped = not val
    else:
        skipped = val in variadic(ignore)
    if skipped:
        return default
    return template % func(val)
4620
4621
def clean_podcast_url(url):
    """
    Strip known podcast tracking/analytics prefixes from url

    Every occurrence of a known tracker segment is removed. If that leaves two
    schemes in a row at the start of the URL, only the second one is kept.
    """
    tracker_prefix = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/'''
    stripped = re.sub(tracker_prefix, '', url)
    return re.sub(r'^\w+://(\w+://)', r'\1', stripped)
4642
4643
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """
    Return a random version 4 UUID as a lowercase, hyphenated string

    In the template, 'x' stands for any hex digit and 'y' for the digit that
    holds the variant field. The two most significant bits of the variant
    digit must be 1 and 0 (RFC 4122, section 4.1.1), so it is restricted to
    8, 9, a or b.
    """
    def random_digit(mobj):
        if mobj.group() == 'y':
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4649
4650
def make_dir(path, to_screen=None):
    """
    Create the parent directory of path (and any missing ancestors)

    @param path       path of a file; only its directory part is created
    @param to_screen  optional callable that receives an error message if the
                      directory could not be created
    @returns          True on success (also when path has no directory part),
                      False if an OSError occurred
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Reporting is optional; the failure must not turn into a TypeError
        # when no callback was given
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4661
4662
def get_executable_path():
    """Return the absolute path of the directory containing the running executable"""
    # Imported here rather than at module level
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
4667
4668
def get_user_config_dirs(package_name):
    """
    Yield the per-user configuration directories of package_name

    In order: the XDG config directory, the %APPDATA% directory (only if that
    environment variable is set) and a dot-directory in the user's home.
    """
    # .config (e.g. ~/.config/package_name)
    config_home = os.getenv('XDG_CONFIG_HOME')
    if not config_home:
        config_home = compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name)
    roaming_dir = os.getenv('appdata')
    if roaming_dir:
        yield os.path.join(roaming_dir, package_name)

    # home (~/.package_name)
    home_dir = compat_expanduser('~')
    yield os.path.join(home_dir, f'.{package_name}')
4681
4682
def get_system_config_dirs(package_name):
    """Yield the system-wide configuration directories of package_name"""
    # /etc/package_name
    system_dir = os.path.join('/etc', package_name)
    yield system_dir
4686
4687
def time_seconds(**kwargs):
    """
    Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the offset that kwargs describe (they are passed to datetime.timedelta)
    """
    now = time.time()
    offset = dt.timedelta(**kwargs)
    return now + offset.total_seconds()
4693
4694
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """
    Create a token of the form header.payload.signature signed with HMAC-SHA256

    @param payload_data  JSON-serializable payload
    @param key           signing key as str
    @param headers       optional extra header fields; they are merged over
                         the default {'alg': 'HS256', 'typ': 'JWT'}
    @returns             the token as bytes

    NOTE(review): the three parts are encoded with the standard, padded base64
    alphabet, whereas RFC 7515 specifies unpadded base64url. This is kept as
    it is, since callers may depend on the exact output - confirm before changing.
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    # The signature covers "<header>.<payload>"
    signing_input = header_b64 + b'.' + payload_b64
    signature = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(signature)
4712
4713
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """
    Return the decoded JSON payload of a three-part token given as str

    Neither the header nor the signature is inspected or verified.
    Raises ValueError if the token does not consist of exactly three
    dot-separated parts.
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded_payload = f'{payload_b64}==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
4720
4721
# False on Windows until VT mode has been enabled; None on every other OS
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """
    Tell whether terminal escape sequences may be written to stream

    On Windows this requires WINDOWS_VT_MODE to be truthy, elsewhere the TERM
    environment variable must be set. In addition, the stream has to report
    being a TTY; any error raised by that check counts as "no".
    The result is cached per stream.
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
4736
4737
def windows_enable_vt_mode():
    """
    Enable virtual terminal processing for the Windows console

    Does nothing on Windows versions older than 10.0.10586. Otherwise the
    ENABLE_VIRTUAL_TERMINAL_PROCESSING flag is added to the mode of the
    console output, WINDOWS_VT_MODE is set to True and the cache of
    supports_terminal_sequences is cleared.
    Raises an Exception if the console mode cannot be read or changed.

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported here, as these are only needed (and msvcrt only available) on Windows
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device itself
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Keep all flags that are already set and add the VT processing flag
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Results cached while VT mode was still disabled are stale now
    supports_terminal_sequences.cache_clear()
4768
4769
# ESC '[' followed by one or more characters other than 'm', followed by 'm'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return string with all sequences matching _terminal_sequences_re removed"""
    return re.sub(_terminal_sequences_re, '', string)
4775
4776
def number_of_digits(number):
    """
    Return the length of number when it is formatted as a decimal integer

    The minus sign of a negative number is counted as well.
    """
    rendered = '%d' % number
    return len(rendered)
4779
4780
def join_nonempty(*values, delim='-', from_dict=None):
    """
    Join the string forms of all truthy values with delim

    @param from_dict  if not None, each value is instead used to look up the
                      actual value in from_dict (via traversal.traverse_obj)
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
4785
4786
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    If no format has a width, the thumbnails are returned as they were passed.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    # Tuples compare element-wise, so the width decides and the height breaks ties
    dimensions = (tuple(fmt.get(key) or 0 for key in _keys) for fmt in formats)
    max_dimensions = max(dimensions, default=(0, 0))
    max_width = max_dimensions[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': scaled_url},
            dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
4807
4808
def parse_http_range(range):
    """
    Parse value of "Range" or "Content-Range" HTTP header into tuple.

    Returns (start, end, total); end and total are None if they are missing.
    If the value is empty or is not a byte range, all three are None.
    """
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not crg:
        return None, None, None
    start, end, total = crg.groups()
    return int(start), int_or_none(end), int_or_none(total)
4817
4818
def read_stdin(what):
    """
    Return sys.stdin, after announcing what is about to be read from it

    @param what  description of the expected input; if falsy, nothing is announced
    """
    if not what:
        return sys.stdin
    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
4824
4825
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data  the first bytes of the file
    @returns (encoding, bytes to skip)
             The encoding is None if it could not be detected
    """

    # BOM marks are given priority over declarations
    for bom, enc in BOMS:
        if data.startswith(bom):
            return enc, len(bom)

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    without_nulls = data.replace(b'\0', b'')
    # Only a "# coding: <name>" declaration at the very start of the data is recognized
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', without_nulls)
    if not declaration:
        return None, 0
    return declaration.group(1).decode(), 0
4842
4843
class Config:
    """
    A set of command line arguments together with the configs it refers to

    The arguments of a config may name further config locations; these are
    loaded recursively as child configs and stored in `configs`.
    """
    # Arguments given to this config itself
    own_args = None
    # Set to own_args once the config has been loaded
    parsed_args = None
    # File the arguments were read from, if any
    filename = None
    # Guards against calling init() on a config that was already loaded
    __initialized = False

    def __init__(self, parser, label=None):
        """
        @param parser  parser used for the arguments; must provide
                       parse_known_args, parse_args and error
        @param label   optional label, used in the string representation
        """
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """
        Set the arguments (and the file they came from) and load all configs
        they refer to. May only be called on a config that is not loaded yet.
        Returns the result of load_configs()
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """
        Parse the own arguments and load every config location they name

        @returns  False if this config's file was already loaded (nothing is
                  done in that case), True otherwise
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            # Never load the same file twice
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # '-' stands for STDIN, which is read at most once
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Locations are joined onto the directory of the current config file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        """
        Describe this config and, on following lines prefixed with "| ", its
        child configs. Values of login options are hidden.
        """
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """
        Read the arguments contained in a config file

        The encoding is detected from the first 512 bytes of the file; if
        that yields nothing, preferredencoding() is used. The contents are
        split like a shell command line, with comments being ignored.

        @returns  list of arguments, or `default` if the file cannot be opened
        @raises ValueError  if the file cannot be decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            # Go back to the start of the contents, skipping a BOM if there is one
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """
        Return a copy of the argument list opts in which the values of login
        options are replaced with 'PRIVATE', for both the "--opt=value" and
        the "--opt value" form
        """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            # The value of a private option is the argument that follows it
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """
        Create a child config from args (the arguments of init()) and add it
        to `configs`, unless its file was already loaded
        """
        config = type(self)(self.parser, label)
        # Shared, so that files already loaded are known to all nested configs
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """
        All arguments: those of the child configs (last added first,
        recursively), followed by the own arguments
        """
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        """Pass all_args (and kwargs) to the parser's parse_known_args"""
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        """Pass all_args to the parser's parse_args"""
        return self.parser.parse_args(self.all_args)
4951
4952
def merge_headers(*dicts):
    """
    Merge dicts of http headers case insensitively, prioritizing the latter ones

    The header names of the result are title-cased (e.g. 'User-Agent')
    """
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
4956
4957
def cached_method(f):
    """
    Cache a method

    The results are stored on the instance (in its '_cached_method__cache'
    attribute), per method name and per set of argument values. Defaults are
    applied first, so passing a default value explicitly hits the same entry.
    All argument values must be hashable.
    """
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound_args = signature.bind(self, *args, **kwargs)
        bound_args.apply_defaults()
        # The first bound argument is the instance itself
        _, *call_values = bound_args.arguments.values()
        key = tuple(call_values)

        instance_cache = vars(self).setdefault('_cached_method__cache', {})
        results = instance_cache.setdefault(f.__name__, {})
        if key in results:
            return results[key]
        results[key] = f(self, *args, **kwargs)
        return results[key]
    return wrapper
4973
4974
class classproperty:
    """
    property access for class methods with optional caching

    Can be used bare (@classproperty) or with options (@classproperty(cache=True)).
    With caching, the wrapped function is called only once per class.
    """
    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Used with options only: return a decorator that still awaits the function
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        # Maps class -> value; None means that caching is disabled
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls not in cache:
            cache[cls] = self.func(cls)
        return cache[cls]
4993
4994
class function_with_repr:
    """
    Wrap a function so that it has a readable repr()

    The repr is the string given as repr_ or, if that is not given, the
    module and qualified name of the function.
    """

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    @classmethod
    def set_repr(cls, repr_):
        """Return a decorator that wraps a function and gives it repr_ as its repr"""
        return functools.partial(cls, repr_=repr_)

    def __repr__(self):
        custom = self.__repr
        if custom:
            return custom
        func = self.func
        return f'{func.__module__}.{func.__qualname__}'
5011
5012
class Namespace(types.SimpleNamespace):
    """
    Namespace that can be iterated over

    Iterating yields the attribute values; `items_` gives (name, value) pairs.
    NOTE(review): this used to be documented as immutable, but nothing in the
    class prevents attributes from being assigned.
    """

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        return vars(self).items()
5022
5023
# File extensions (without the leading dot), grouped by media type
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# `video` and `audio` also include the respective common extensions (appended at the end)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in this order
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5038
5039
5040 class RetryManager:
5041     """Usage:
5042         for retry in RetryManager(...):
5043             try:
5044                 ...
5045             except SomeException as err:
5046                 retry.error = err
5047                 continue
5048     """
5049     attempt, _error = 0, None
5050
5051     def __init__(self, _retries, _error_callback, **kwargs):
5052         self.retries = _retries or 0
5053         self.error_callback = functools.partial(_error_callback, **kwargs)
5054
5055     def _should_retry(self):
5056         return self._error is not NO_DEFAULT and self.attempt <= self.retries
5057
5058     @property
5059     def error(self):
5060         if self._error is NO_DEFAULT:
5061             return None
5062         return self._error
5063
5064     @error.setter
5065     def error(self, value):
5066         self._error = value
5067
5068     def __iter__(self):
5069         while self._should_retry():
5070             self.error = NO_DEFAULT
5071             self.attempt += 1
5072             yield self
5073             if self.error:
5074                 self.error_callback(self.error, self.attempt, self.retries)
5075
5076     @staticmethod
5077     def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5078         """Utility function for reporting retries"""
5079         if count > retries:
5080             if error:
5081                 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5082             raise e
5083
5084         if not count:
5085             return warn(e)
5086         elif isinstance(e, ExtractorError):
5087             e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5088         warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5089
5090         delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5091         if delay:
5092             info(f'Sleeping {delay:.2f} seconds ...')
5093             time.sleep(delay)
5094
5095
def make_archive_id(ie, video_id):
    """
    Build an archive ID of the form '<lowercase extractor key> <video id>'

    @param ie  either the extractor key as str, or an object whose ie_key()
               method returns it
    """
    if not isinstance(ie, str):
        ie = ie.ie_key()
    return f'{ie.lower()} {video_id}'
5099
5100
def truncate_string(s, left, right=0):
    """
    Shorten s by replacing its middle (or end) with '...'

    A string longer than left + right characters is cut down to its first
    left - 3 characters, followed by '...' and its last `right` characters.
    Shorter strings, and None, are returned unchanged.

    @param left   length of the leading part including the '...'; must be > 3
    @param right  number of trailing characters to keep; must be >= 0
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5106
5107
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """
    Build an ordered collection of unique items from a list of options

    Each option adds the item(s) it names, or removes them if it is prefixed
    with '-'. An option may be an alias, i.e. a key of alias_dict, which
    stands for the list of options it maps to; a discarded alias discards
    what it would otherwise add, and vice versa.

    @param alias_dict  maps alias names to lists of options; its 'all' entry
                       is required and lists every valid item
    @param use_regex   treat options that are not aliases as case-insensitive
                       regexes that have to match a valid item entirely
    @param start       items to start with
    @raises ValueError  if an option is neither an alias nor a valid item
                        (only when use_regex is off)
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for option in options:
        discard = option.startswith('-')
        name = option[1:] if discard else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if discard:
                # Invert every option that the alias stands for
                expansion = [
                    i[1:] if i.startswith('-') else f'-{i}' for i in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            current = [name]
        else:
            raise ValueError(name)

        if not discard:
            requested.extend(current)
            continue
        for item in current:
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5136
5137
5138 # TODO: Rewrite
class FormatSorter:
    """
    Turn a list of format sort strings into per-format sort keys.

    A sort string has the shape `[+]FIELD[:LIMIT]` or `[+]FIELD[~LIMIT]` (see `regex`):
    a leading `+` sets the `reverse` flag, `:` attaches a limit and `~` attaches a limit
    with the `closest` flag set.

    The effective order is assembled in `evaluate_params` from the forced fields, the priority
    fields (unless `format_sort_force` is set), the user's `format_sort`, the sort strings
    passed by the caller and finally `default`. `calculate_preference` then maps a format dict
    to a tuple holding one entry per field of that order.

    @param ydl               Object providing `params` (a dict), `deprecated_feature(message)`
                             and `write_debug(message)`
    @param field_preference  Iterable of sort strings; reported as "given by extractor"
    """
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Template of the per-field settings. Instances work on their own copy (see `__init__`).
    # Recognised keys, as read throughout this class:
    #   type         'field' (default), 'ordered', 'boolean', 'extractor', 'multiple', 'combined' or 'alias'
    #   field        Key(s) of the format dict to read, or the target of an alias
    #   convert      How a limit text is converted (see `_resolve_field_value`)
    #   order        List of values (regexes if 'regex' is set) for 'ordered' fields, first entry ranks highest
    #   order_free   Alternative to 'order' that is used when `prefer_free_formats` is set
    #   function     Reduces the values of all 'field' entries of a 'multiple' field to a single value
    #   forced, priority, visible, default, max, in_list, not_in_list, same_limit, deprecated
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    # Unknown field names for which the deprecation notice has already been issued.
    # Deliberately shared between instances so that each name is reported only once
    _reported_arbitrary_fields = set()

    def __init__(self, ydl, field_preference):
        self.ydl = ydl
        self._order = []
        # Evaluating the sort strings writes limits, flags and cached defaults into the settings.
        # Work on a private copy so that instances cannot overwrite each other's values
        # (the nested values are never modified, so copying one level deep is sufficient)
        self.settings = {field: dict(spec) for field, spec in self.settings.items()}
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """
        Look up the setting `key` of `field`, computing and caching its default if it is unset.

        Unknown fields are registered with empty settings (after a one-time deprecation notice),
        except when only 'forced' or 'priority' is queried, in which case False is returned
        without registering anything.
        """
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            if field not in self._reported_arbitrary_fields:
                self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                            'deprecated and may be removed in a future version')
                self._reported_arbitrary_fields.add(field)
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """
        Convert `value` (a limit text or a raw format value) according to the field's 'convert' setting.

        @param convertNone  Whether a `value` of None should be converted too instead of returning None
        @returns            None, a string or a number, depending on the conversion:
                            'ignore' gives None, 'string' the lowercased value, 'float_none' and 'bytes'
                            whatever `float_or_none`/`parse_bytes` give, 'order' a rank derived from the
                            position in the order list, and anything else a float for numeric strings
                            or the lowercased string otherwise
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # The '' entry marks the rank of values that match nothing else.
            # Without it, such values end up below every listed entry (rank -1)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or  value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # A non-numeric limit means that the field has to be compared as a string from now on
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """
        Parse all sort strings and fill `self._order` as well as the per-field limit settings.

        @param params          Dict from which 'prefer_free_formats', 'format_sort' and 'format_sort_force' are read
        @param sort_extractor  Iterable of sort strings, applied after those from `params`
        @raises ExtractorError if a sort string does not match `regex`
        """
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            field = field.lower()
            # The first occurrence of a field wins; later ones (e.g. from the defaults) are ignored
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
                else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A 'combined' field expands to its constituent fields,
            # whose individual limits are given separated by ':'
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Report the user's and the extractor's sort strings and the resulting visible order via `write_debug`"""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """
        Map a single field value to its comparable sort entry.

        @returns  (-10, 0) if there is no value, (1, value, 0) for non-numeric values and otherwise
                  a tuple starting with 0 or -1 that depends on the 'reverse', 'closest' and 'limit'
                  settings of the field
        """
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Read the value of `field` from the format dict and map it to its sort entry"""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """
        Compute the sort key of a format dict.

        NB: `format` is modified in place: missing 'protocol', 'ext' and bitrates are filled in
        where possible, and 'audio_ext' and 'video_ext' are always (re)written.

        @returns  Tuple with one entry per field in the evaluated sort order
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5433
5434
def filesize_from_tbr(tbr, duration):
    """
    Estimate a filesize from a total bitrate and a duration

    @param tbr:      Total bitrate in kbps (1000 bits/sec)
    @param duration: Duration in seconds
    @returns         Filesize in bytes (truncated to an integer),
                     or None if either argument is None
    """
    if tbr is not None and duration is not None:
        # 1 kbps = 1000 bits/sec = 1000 / 8 bytes/sec
        bytes_per_kilobit = 1000 / 8
        return int(duration * tbr * bytes_per_kilobit)
    return None
5444
5445
5446 # XXX: Temporary
class _YDLLogger:
    """
    Logger-like adapter that forwards messages to the matching output methods of `ydl`

    Every method is a no-op when no (or a falsy) `ydl` was given
    """

    def __init__(self, ydl=None):
        self._ydl = ydl

    def debug(self, message):
        if not self._ydl:
            return
        self._ydl.write_debug(message)

    def info(self, message):
        if not self._ydl:
            return
        self._ydl.to_screen(message)

    def warning(self, message, *, once=False):
        if not self._ydl:
            return
        self._ydl.report_warning(message, once)

    def error(self, message, *, is_error=True):
        if not self._ydl:
            return
        self._ydl.report_error(message, is_error=is_error)

    def stdout(self, message):
        if not self._ydl:
            return
        self._ydl.to_stdout(message)

    def stderr(self, message):
        if not self._ydl:
            return
        self._ydl.to_stderr(message)