yt_dlp/utils/_utils.py

   1 import base64
   2 import binascii
   3 import calendar
   4 import codecs
   5 import collections
   6 import collections.abc
   7 import contextlib
   8 import datetime as dt
   9 import email.header
  10 import email.utils
  11 import errno
  12 import hashlib
  13 import hmac
  14 import html.entities
  15 import html.parser
  16 import inspect
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import mimetypes
  23 import netrc
  24 import operator
  25 import os
  26 import platform
  27 import random
  28 import re
  29 import shlex
  30 import socket
  31 import ssl
  32 import struct
  33 import subprocess
  34 import sys
  35 import tempfile
  36 import time
  37 import traceback
  38 import types
  39 import unicodedata
  40 import urllib.error
  41 import urllib.parse
  42 import urllib.request
  43 import xml.etree.ElementTree
  44
  45 from . import traversal
  46
  47 from ..compat import functools  # isort: split
  48 from ..compat import (
  49     compat_etree_fromstring,
  50     compat_expanduser,
  51     compat_HTMLParseError,
  52     compat_os_name,
  53 )
  54 from ..dependencies import xattr
  55
# Rebind this module's name to its parent package path by dropping the last dotted component
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
# (obtained here by taking the type of a freshly compiled pattern object)
compiled_regex_type = type(re.compile(''))
  60
  61
  62 class NO_DEFAULT:
  63     pass
  64
  65
  66 def IDENTITY(x):
  67     return x
  68
  69
# Full English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code; every list is in calendar order like ENGLISH_MONTH_NAMES
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
  85
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Maps a timezone abbreviation to its offset from UTC
# NOTE(review): the values look like whole hours (e.g. EST is -5) - confirm against the callers
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
  95
# needed for sanitizing filenames in restricted mode
# Maps each accented character of the first string to its ASCII replacement.
# The replacements are zipped positionally: plain strings contribute one letter per
# character, while the one-element lists contribute a multi-letter replacement
# ('AE', 'OE', 'TH', 'ss', ...), so both sequences must stay aligned.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 100
 101 DATE_FORMATS = (
 102     '%d %B %Y',
 103     '%d %b %Y',
 104     '%B %d %Y',
 105     '%B %dst %Y',
 106     '%B %dnd %Y',
 107     '%B %drd %Y',
 108     '%B %dth %Y',
 109     '%b %d %Y',
 110     '%b %dst %Y',
 111     '%b %dnd %Y',
 112     '%b %drd %Y',
 113     '%b %dth %Y',
 114     '%b %dst %Y %I:%M',
 115     '%b %dnd %Y %I:%M',
 116     '%b %drd %Y %I:%M',
 117     '%b %dth %Y %I:%M',
 118     '%Y %m %d',
 119     '%Y-%m-%d',
 120     '%Y.%m.%d.',
 121     '%Y/%m/%d',
 122     '%Y/%m/%d %H:%M',
 123     '%Y/%m/%d %H:%M:%S',
 124     '%Y%m%d%H%M',
 125     '%Y%m%d%H%M%S',
 126     '%Y%m%d',
 127     '%Y-%m-%d %H:%M',
 128     '%Y-%m-%d %H:%M:%S',
 129     '%Y-%m-%d %H:%M:%S.%f',
 130     '%Y-%m-%d %H:%M:%S:%f',
 131     '%d.%m.%Y %H:%M',
 132     '%d.%m.%Y %H.%M',
 133     '%Y-%m-%dT%H:%M:%SZ',
 134     '%Y-%m-%dT%H:%M:%S.%fZ',
 135     '%Y-%m-%dT%H:%M:%S.%f0Z',
 136     '%Y-%m-%dT%H:%M:%S',
 137     '%Y-%m-%dT%H:%M:%S.%f',
 138     '%Y-%m-%dT%H:%M',
 139     '%b %d %Y at %H:%M',
 140     '%b %d %Y at %H:%M:%S',
 141     '%B %d %Y at %H:%M',
 142     '%B %d %Y at %H:%M:%S',
 143     '%H:%M %d-%b-%Y',
 144 )
 145
 146 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 147 DATE_FORMATS_DAY_FIRST.extend([
 148     '%d-%m-%Y',
 149     '%d.%m.%Y',
 150     '%d.%m.%y',
 151     '%d/%m/%Y',
 152     '%d/%m/%y',
 153     '%d/%m/%Y %H:%M:%S',
 154     '%d-%m-%Y %H:%M',
 155     '%H:%M %d/%m/%Y',
 156 ])
 157
 158 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 159 DATE_FORMATS_MONTH_FIRST.extend([
 160     '%m-%d-%Y',
 161     '%m.%d.%Y',
 162     '%m/%d/%Y',
 163     '%m/%d/%y',
 164     '%m/%d/%Y %H:%M:%S',
 165 ])
 166
# Matches `}('<payload>',<int>,<int>,'<a|b|c>'.split('|')` and captures the four arguments
# NOTE(review): presumably the call tail of packed (p.a.c.k.e.r-style) JavaScript - confirm
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type is application/ld+json (case-insensitive, dot matches
# newlines) and captures its object or array body in the `json_ld` group
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Unsigned decimal number with an optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
 171
 172
 173 @functools.cache
 174 def preferredencoding():
 175     """Get preferred encoding.
 176
 177     Returns the best encoding scheme for the system, based on
 178     locale.getpreferredencoding() and some further tweaks.
 179     """
 180     try:
 181         pref = locale.getpreferredencoding()
 182         'TEST'.encode(pref)
 183     except Exception:
 184         pref = 'UTF-8'
 185
 186     return pref
 187
 188
 189 def write_json_file(obj, fn):
 190     """ Encode obj as JSON and write it to fn, atomically if possible """
 191
 192     tf = tempfile.NamedTemporaryFile(
 193         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 194         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 195
 196     try:
 197         with tf:
 198             json.dump(obj, tf, ensure_ascii=False)
 199         if sys.platform == 'win32':
 200             # Need to remove existing file on Windows, else os.rename raises
 201             # WindowsError or FileExistsError.
 202             with contextlib.suppress(OSError):
 203                 os.unlink(fn)
 204         with contextlib.suppress(OSError):
 205             mask = os.umask(0)
 206             os.umask(mask)
 207             os.chmod(tf.name, 0o666 & ~mask)
 208         os.rename(tf.name, fn)
 209     except Exception:
 210         with contextlib.suppress(OSError):
 211             os.remove(tf.name)
 212         raise
 213
 214
 215 def find_xpath_attr(node, xpath, key, val=None):
 216     """ Find the xpath xpath[@key=val] """
 217     assert re.match(r'^[a-zA-Z_-]+$', key)
 218     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 219     return node.find(expr)
 220
# On Python 2.6 the xml.etree.ElementTree.Element methods don't support
# the namespaces parameter (presumably why namespace prefixes are expanded by hand below)
 223
 224
 225 def xpath_with_ns(path, ns_map):
 226     components = [c.split(':') for c in path.split('/')]
 227     replaced = []
 228     for c in components:
 229         if len(c) == 1:
 230             replaced.append(c[0])
 231         else:
 232             ns, tag = c
 233             replaced.append('{%s}%s' % (ns_map[ns], tag))
 234     return '/'.join(replaced)
 235
 236
 237 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 238     def _find_xpath(xpath):
 239         return node.find(xpath)
 240
 241     if isinstance(xpath, str):
 242         n = _find_xpath(xpath)
 243     else:
 244         for xp in xpath:
 245             n = _find_xpath(xp)
 246             if n is not None:
 247                 break
 248
 249     if n is None:
 250         if default is not NO_DEFAULT:
 251             return default
 252         elif fatal:
 253             name = xpath if name is None else name
 254             raise ExtractorError('Could not find XML element %s' % name)
 255         else:
 256             return None
 257     return n
 258
 259
 260 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 261     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 262     if n is None or n == default:
 263         return n
 264     if n.text is None:
 265         if default is not NO_DEFAULT:
 266             return default
 267         elif fatal:
 268             name = xpath if name is None else name
 269             raise ExtractorError('Could not find XML element\'s text %s' % name)
 270         else:
 271             return None
 272     return n.text
 273
 274
 275 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 276     n = find_xpath_attr(node, xpath, key)
 277     if n is None:
 278         if default is not NO_DEFAULT:
 279             return default
 280         elif fatal:
 281             name = f'{xpath}[@{key}]' if name is None else name
 282             raise ExtractorError('Could not find XML attribute %s' % name)
 283         else:
 284             return None
 285     return n.attrib[key]
 286
 287
 288 def get_element_by_id(id, html, **kwargs):
 289     """Return the content of the tag with the specified ID in the passed HTML document"""
 290     return get_element_by_attribute('id', id, html, **kwargs)
 291
 292
 293 def get_element_html_by_id(id, html, **kwargs):
 294     """Return the html of the tag with the specified ID in the passed HTML document"""
 295     return get_element_html_by_attribute('id', id, html, **kwargs)
 296
 297
 298 def get_element_by_class(class_name, html):
 299     """Return the content of the first tag with the specified class in the passed HTML document"""
 300     retval = get_elements_by_class(class_name, html)
 301     return retval[0] if retval else None
 302
 303
 304 def get_element_html_by_class(class_name, html):
 305     """Return the html of the first tag with the specified class in the passed HTML document"""
 306     retval = get_elements_html_by_class(class_name, html)
 307     return retval[0] if retval else None
 308
 309
 310 def get_element_by_attribute(attribute, value, html, **kwargs):
 311     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 312     return retval[0] if retval else None
 313
 314
 315 def get_element_html_by_attribute(attribute, value, html, **kargs):
 316     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 317     return retval[0] if retval else None
 318
 319
 320 def get_elements_by_class(class_name, html, **kargs):
 321     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 322     return get_elements_by_attribute(
 323         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 324         html, escape_value=False)
 325
 326
 327 def get_elements_html_by_class(class_name, html):
 328     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 329     return get_elements_html_by_attribute(
 330         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 331         html, escape_value=False)
 332
 333
 334 def get_elements_by_attribute(*args, **kwargs):
 335     """Return the content of the tag with the specified attribute in the passed HTML document"""
 336     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 337
 338
 339 def get_elements_html_by_attribute(*args, **kwargs):
 340     """Return the html of the tag with the specified attribute in the passed HTML document"""
 341     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 342
 343
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator: it yields one (content, whole) tuple per matching element.

    @param attribute     Attribute name (matched literally)
    @param value         Attribute value to look for; nothing is yielded if it is empty
    @param html          The HTML document to search
    @param tag           Regular expression the tag name has to match
    @param escape_value  Whether value is literal text (True) or already a regular expression (False)
    """
    if not value:
        return

    # Quotes around the attribute value are optional ('?') unless the value
    # starts with whitespace or one of "'`=<>, in which case they are required
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches from the opening '<' of a tag up to and including the wanted attribute;
    # the (?-x:...) group keeps whitespace inside value significant
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Extract the complete element, starting at the tag that was just matched
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of matching quotes wrapping the whole content, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 369
 370
 371 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 372     """
 373     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 374     closing tag for the first opening tag it has encountered, and can be used
 375     as a context manager
 376     """
 377
 378     class HTMLBreakOnClosingTagException(Exception):
 379         pass
 380
 381     def __init__(self):
 382         self.tagstack = collections.deque()
 383         html.parser.HTMLParser.__init__(self)
 384
 385     def __enter__(self):
 386         return self
 387
 388     def __exit__(self, *_):
 389         self.close()
 390
 391     def close(self):
 392         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 393         # so data remains buffered; we no longer have any interest in it, thus
 394         # override this method to discard it
 395         pass
 396
 397     def handle_starttag(self, tag, _):
 398         self.tagstack.append(tag)
 399
 400     def handle_endtag(self, tag):
 401         if not self.tagstack:
 402             raise compat_HTMLParseError('no tags in the stack')
 403         while self.tagstack:
 404             inner_tag = self.tagstack.pop()
 405             if inner_tag == tag:
 406                 break
 407         else:
 408             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 409         if not self.tagstack:
 410             raise self.HTMLBreakOnClosingTagException()
 411
 412
 413 # XXX: This should be far less strict
 414 def get_element_text_and_html_by_tag(tag, html):
 415     """
 416     For the first element with the specified tag in the passed HTML document
 417     return its' content (text) and the whole element (html)
 418     """
 419     def find_or_raise(haystack, needle, exc):
 420         try:
 421             return haystack.index(needle)
 422         except ValueError:
 423             raise exc
 424     closing_tag = f'</{tag}>'
 425     whole_start = find_or_raise(
 426         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 427     content_start = find_or_raise(
 428         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 429     content_start += whole_start + 1
 430     with HTMLBreakOnClosingTagParser() as parser:
 431         parser.feed(html[whole_start:content_start])
 432         if not parser.tagstack or parser.tagstack[0] != tag:
 433             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 434         offset = content_start
 435         while offset < len(html):
 436             next_closing_tag_start = find_or_raise(
 437                 html[offset:], closing_tag,
 438                 compat_HTMLParseError(f'closing {tag} tag not found'))
 439             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 440             try:
 441                 parser.feed(html[offset:offset + next_closing_tag_end])
 442                 offset += next_closing_tag_end
 443             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 444                 return html[content_start:offset + next_closing_tag_start], \
 445                     html[whole_start:offset + next_closing_tag_end]
 446         raise compat_HTMLParseError('unexpected end of html')
 447
 448
 449 class HTMLAttributeParser(html.parser.HTMLParser):
 450     """Trivial HTML parser to gather the attributes for a single element"""
 451
 452     def __init__(self):
 453         self.attrs = {}
 454         html.parser.HTMLParser.__init__(self)
 455
 456     def handle_starttag(self, tag, attrs):
 457         self.attrs = dict(attrs)
 458         raise compat_HTMLParseError('done')
 459
 460
 461 class HTMLListAttrsParser(html.parser.HTMLParser):
 462     """HTML parser to gather the attributes for the elements of a list"""
 463
 464     def __init__(self):
 465         html.parser.HTMLParser.__init__(self)
 466         self.items = []
 467         self._level = 0
 468
 469     def handle_starttag(self, tag, attrs):
 470         if tag == 'li' and self._level == 0:
 471             self.items.append(dict(attrs))
 472         self._level += 1
 473
 474     def handle_endtag(self, tag):
 475         self._level -= 1
 476
 477
 478 def extract_attributes(html_element):
 479     """Given a string for an HTML element such as
 480     <el
 481          a="foo" B="bar" c="&98;az" d=boz
 482          empty= noval entity="&amp;"
 483          sq='"' dq="'"
 484     >
 485     Decode and return a dictionary of attributes.
 486     {
 487         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 488         'empty': '', 'noval': None, 'entity': '&',
 489         'sq': '"', 'dq': '\''
 490     }.
 491     """
 492     parser = HTMLAttributeParser()
 493     with contextlib.suppress(compat_HTMLParseError):
 494         parser.feed(html_element)
 495         parser.close()
 496     return parser.attrs
 497
 498
 499 def parse_list(webpage):
 500     """Given a string for an series of HTML <li> elements,
 501     return a dictionary of their attributes"""
 502     parser = HTMLListAttrsParser()
 503     parser.feed(webpage)
 504     parser.close()
 505     return parser.items
 506
 507
 508 def clean_html(html):
 509     """Clean an HTML snippet into a readable string"""
 510
 511     if html is None:  # Convenience for sanitizing descriptions etc.
 512         return html
 513
 514     html = re.sub(r'\s+', ' ', html)
 515     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 516     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 517     # Strip html tags
 518     html = re.sub('<.*?>', '', html)
 519     # Replace html entities
 520     html = unescapeHTML(html)
 521     return html.strip()
 522
 523
 524 class LenientJSONDecoder(json.JSONDecoder):
 525     # TODO: Write tests
 526     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 527         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 528         self._close_attempts = 2 * close_objects
 529         super().__init__(*args, **kwargs)
 530
 531     @staticmethod
 532     def _close_object(err):
 533         doc = err.doc[:err.pos]
 534         # We need to add comma first to get the correct error message
 535         if err.msg.startswith('Expecting \',\''):
 536             return doc + ','
 537         elif not doc.endswith(','):
 538             return
 539
 540         if err.msg.startswith('Expecting property name'):
 541             return doc[:-1] + '}'
 542         elif err.msg.startswith('Expecting value'):
 543             return doc[:-1] + ']'
 544
 545     def decode(self, s):
 546         if self.transform_source:
 547             s = self.transform_source(s)
 548         for attempt in range(self._close_attempts + 1):
 549             try:
 550                 if self.ignore_extra:
 551                     return self.raw_decode(s.lstrip())[0]
 552                 return super().decode(s)
 553             except json.JSONDecodeError as e:
 554                 if e.pos is None:
 555                     raise
 556                 elif attempt < self._close_attempts:
 557                     s = self._close_object(e)
 558                     if s is not None:
 559                         continue
 560                 raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
 561         assert False, 'Too many attempts to decode JSON'
 562
 563
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    A filename of '-' selects standard output instead of a file.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        # Prefer the underlying binary buffer when stdout exposes one
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # At most two attempts: the name as given, then the name returned by sanitize_path()
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Fall back to a plain, unlocked open()
                # NOTE(review): relies on LockingUnsupportedError being an OSError subclass - confirm
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt, and never retry permission errors
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Sanitizing did not change the name, so retrying would fail the same way
            if old_filename == filename:
                raise
 600                 raise
 601
 602
 603 def timeconvert(timestr):
 604     """Convert RFC 2822 defined time string into system timestamp"""
 605     timestamp = None
 606     timetuple = email.utils.parsedate_tz(timestr)
 607     if timetuple is not None:
 608         timestamp = email.utils.mktime_tz(timetuple)
 609     return timestamp
 610
 611
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    Returns the sanitized string; an empty input yields an empty string.
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Map one character to its replacement. Substitute characters are
        # prefixed with '\0' so that they can be told apart from characters
        # of the input when they are collapsed and stripped further below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Question marks and control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            # Characters in Unicode categories C* and M* are dropped, the rest are substituted
            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the '\0' markers; a result that would be empty becomes '_'
    result = result.replace('\0', '') or '_'

    # Only taken when is_id was explicitly passed as a falsy value:
    # the NO_DEFAULT sentinel itself is truthy
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 665
 666
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    @param force    Also sanitize on other platforms; without it the path
                    is returned unchanged there
    """
    # XXX: this handles drive relative paths (c:sth) incorrectly
    if sys.platform == 'win32':
        # force only has an effect on non-Windows platforms
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    # Normalize everything after the drive/UNC prefix and split it into components
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace the characters /<>:"|\?* as well as a trailing space or dot
    # with '#' in every component except '.' and '..'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    # TODO: Fix behavioral differences <3.12
    # The workaround using `normpath` only superficially passes tests
    # Ref: https://github.com/python/cpython/pull/100351
    return os.path.normpath(os.path.join(*sanitized_path))
 692
 693
 694 def sanitize_url(url, *, scheme='http'):
 695     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 696     # the number of unwanted failures due to missing protocol
 697     if url is None:
 698         return
 699     elif url.startswith('//'):
 700         return f'{scheme}:{url}'
 701     # Fix some common typos seen so far
 702     COMMON_TYPOS = (
 703         # https://github.com/ytdl-org/youtube-dl/issues/15649
 704         (r'^httpss://', r'https://'),
 705         # https://bx1.be/lives/direct-tv/
 706         (r'^rmtp([es]?)://', r'rtmp\1://'),
 707     )
 708     for mistake, fixup in COMMON_TYPOS:
 709         if re.match(mistake, url):
 710             return re.sub(mistake, fixup, url)
 711     return url
 712
 713
 714 def extract_basic_auth(url):
 715     parts = urllib.parse.urlsplit(url)
 716     if parts.username is None:
 717         return url, None
 718     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 719         parts.hostname if parts.port is None
 720         else '%s:%d' % (parts.hostname, parts.port))))
 721     auth_payload = base64.b64encode(
 722         ('%s:%s' % (parts.username, parts.password or '')).encode())
 723     return url, f'Basic {auth_payload.decode()}'
 724
 725
 726 def expand_path(s):
 727     """Expand shell variables and ~"""
 728     return os.path.expandvars(compat_expanduser(s))
 729
 730
 731 def orderedSet(iterable, *, lazy=False):
 732     """Remove all duplicates from the input iterable"""
 733     def _iter():
 734         seen = []  # Do not use set since the items can be unhashable
 735         for x in iterable:
 736             if x not in seen:
 737                 seen.append(x)
 738                 yield x
 739
 740     return _iter() if lazy else list(_iter())
 741
 742
 743 def _htmlentity_transform(entity_with_semicolon):
 744     """Transforms an HTML entity to a character."""
 745     entity = entity_with_semicolon[:-1]
 746
 747     # Known non-numeric HTML entity
 748     if entity in html.entities.name2codepoint:
 749         return chr(html.entities.name2codepoint[entity])
 750
 751     # TODO: HTML5 allows entities without a semicolon.
 752     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 753     if entity_with_semicolon in html.entities.html5:
 754         return html.entities.html5[entity_with_semicolon]
 755
 756     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 757     if mobj is not None:
 758         numstr = mobj.group(1)
 759         if numstr.startswith('x'):
 760             base = 16
 761             numstr = '0%s' % numstr
 762         else:
 763             base = 10
 764         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 765         with contextlib.suppress(ValueError):
 766             return chr(int(numstr, base))
 767
 768     # Unknown entity in name, return its literal representation
 769     return '&%s;' % entity
 770
 771
 772 def unescapeHTML(s):
 773     if s is None:
 774         return None
 775     assert isinstance(s, str)
 776
 777     return re.sub(
 778         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 779
 780
 781 def escapeHTML(text):
 782     return (
 783         text
 784         .replace('&', '&amp;')
 785         .replace('<', '&lt;')
 786         .replace('>', '&gt;')
 787         .replace('"', '&quot;')
 788         .replace("'", '&#39;')
 789     )
 790
 791
 792 class netrc_from_content(netrc.netrc):
 793     def __init__(self, content):
 794         self.hosts, self.macros = {}, {}
 795         with io.StringIO(content) as stream:
 796             self._parse('-', stream, False)
 797
 798
class Popen(subprocess.Popen):
    """subprocess.Popen with adjustments for PyInstaller builds, Windows shells and text mode"""

    # Startup info passed to every child process; only created on Windows
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573

        Modifies env in place; does nothing unless sys._MEIPASS exists.
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            # Restore the value saved under `<key>_ORIG`, or drop the variable if none was saved
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        """
        @param env      Environment of the child process; a copy of os.environ if None.
                        NOTE(review): a mapping passed by the caller is modified in place below
        @param text     If True, use text mode with utf-8 encoding and errors='replace'
                        unless the caller specifies otherwise
        @param shell    Run the command through a shell; on Windows, cmd.exe is invoked
                        explicitly unless an executable is given
        All other arguments are passed on to subprocess.Popen.
        """
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Truthy if the streams yield str instead of bytes; read by run() to pick the empty default
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            if not isinstance(args, str):
                args = shell_quote(args, shell=True)
            # The shell is invoked explicitly below instead of through shell=True
            shell = False
            # Set variable for `cmd.exe` newline escaping (see `utils.shell_quote`)
            env['='] = '"^\n\n"'
            args = f'{self.__comspec()} /Q /S /D /V:OFF /E:ON /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        """Return the absolute path of the command interpreter

        Uses %ComSpec%, falling back to %SystemRoot%\\System32\\cmd.exe, and
        raises FileNotFoundError if the result is not an absolute path.
        """
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process and wait for it to exit if anything is raised"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process and, unless timeout is 0, wait for it to exit

        @param timeout  Passed to wait(); the default of 0 skips waiting altogether
        """
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run a process to completion and return the tuple (stdout, stderr, returncode)

        Falsy output (such as None) is replaced with an empty str or bytes, depending on the mode.
        """
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
 869             stdout, stderr = proc.communicate_or_kill(timeout=timeout)
 870             return stdout or default, stderr or default, proc.returncode
 871
 872
 873 def encodeArgument(s):
 874     # Legacy code that uses byte strings
 875     # Uncomment the following line after fixing all post processors
 876     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 877     return s if isinstance(s, str) else s.decode('ascii')
 878
 879
 880 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 881
 882
 883 def timetuple_from_msec(msec):
 884     secs, msec = divmod(msec, 1000)
 885     mins, secs = divmod(secs, 60)
 886     hrs, mins = divmod(mins, 60)
 887     return _timetuple(hrs, mins, secs, msec)
 888
 889
 890 def formatSeconds(secs, delim=':', msec=False):
 891     time = timetuple_from_msec(secs * 1000)
 892     if time.hours:
 893         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 894     elif time.minutes:
 895         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 896     else:
 897         ret = '%d' % time.seconds
 898     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 899
 900
 901 def bug_reports_message(before=';'):
 902     from ..update import REPOSITORY
 903
 904     msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
 905            'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')
 906
 907     before = before.rstrip()
 908     if not before or before.endswith(('.', '!', '?')):
 909         msg = msg[0].title() + msg[1:]
 910
 911     return (before + ' ' if before else '') + msg
 912
 913
 914 class YoutubeDLError(Exception):
 915     """Base exception for YoutubeDL errors."""
 916     msg = None
 917
 918     def __init__(self, msg=None):
 919         if msg is not None:
 920             self.msg = msg
 921         elif self.msg is None:
 922             self.msg = type(self).__name__
 923         super().__init__(self.msg)
 924
 925
class ExtractorError(YoutubeDLError):
    """Error during info extraction.

    The final message is assembled from `ie`, `video_id`, `orig_msg` and `cause`
    (plus the bug report hint when the error is not `expected`) and is rebuilt
    whenever one of the attributes is changed after construction.
    """

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.

        @param msg          The bare error message; stored as `orig_msg` after conversion to str
        @param cause        Value appended to the message as "(caused by ...)" and used by `format_traceback`
        @param video_id     If given, prefixed to the message as "<video_id>: "
        @param ie           If given, prefixed to the message as "[<ie>] "
        """
        from ..networking.exceptions import network_exceptions
        # An error raised while a network exception is being handled is always treated as expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        # NB: `msg` is still unset (None) while these are assigned, so `__setattr__` does not rebuild anything yet
        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When wrapping another ExtractorError, keep the exc_info that one recorded
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        """The full message built from the current attribute values"""
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted `traceback` and `cause` joined by a newline, or None if there is neither"""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            # [1:] drops the first line produced by `format_exception`
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        """Set the attribute, then refresh `msg` and `args` so that they reflect the change"""
        super().__setattr__(name, value)
        # Only once `msg` has been set (i.e. after construction); skipping 'msg'/'args' avoids infinite recursion
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
 968
 969
 970 class UnsupportedError(ExtractorError):
 971     def __init__(self, url):
 972         super().__init__(
 973             'Unsupported URL: %s' % url, expected=True)
 974         self.url = url
 975
 976
 977 class RegexNotFoundError(ExtractorError):
 978     """Error when a regex didn't match"""
 979     pass
 980
 981
 982 class GeoRestrictedError(ExtractorError):
 983     """Geographic restriction Error exception.
 984
 985     This exception may be thrown when a video is not available from your
 986     geographic location due to geographic restrictions imposed by a website.
 987     """
 988
 989     def __init__(self, msg, countries=None, **kwargs):
 990         kwargs['expected'] = True
 991         super().__init__(msg, **kwargs)
 992         self.countries = countries
 993
 994
 995 class UserNotLive(ExtractorError):
 996     """Error when a channel/user is not live"""
 997
 998     def __init__(self, msg=None, **kwargs):
 999         kwargs['expected'] = True
1000         super().__init__(msg or 'The channel is not currently live', **kwargs)
1001
1002
class DownloadError(YoutubeDLError):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
1015
1016
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Default message, used when the exception is created without one
    msg = 'Entry not found in info'
1024
1025
class SameFileError(YoutubeDLError):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """filename, if given, is appended to the default message"""
        message = self.msg if filename is None else f'{self.msg}: {filename}'
        super().__init__(message)
1038
1039
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    It adds nothing to YoutubeDLError besides the distinct type.
    """
1046
1047
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Default message, used when the exception is created without one
    msg = 'The download was cancelled'
1051
1052
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Default message, used when the exception is created without one
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1056
1057
class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    # Default message, used when the exception is created without one
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1061
1062
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Default message, used when the exception is created without one
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1066
1067
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted.

    `expected` is stored on the instance as given.
    """

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1074
1075
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    # Fixed message; the constructor takes no arguments
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        """Create the error with the fixed message, marked as not expected"""
        super().__init__(self.msg, expected=False)
1082
1083
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Thrown when a video is requested in a format
    that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """err, if given, is appended to the default message"""
        message = self.msg if err is None else f'{self.msg}: {err}'
        super().__init__(message)
1096
1097
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a file they download
    is smaller than what the server announced first, indicating that the
    connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1111
1112
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an error `code`, a message and a `reason` derived from both

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        out_of_space = (
            code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in msg
            or 'Disk quota exceeded' in msg)
        if out_of_space:
            reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1127
1128
class XAttrUnavailableError(YoutubeDLError):
    """Distinct error type without any additions to YoutubeDLError

    NOTE(review): presumably signals that extended attributes cannot be used at all; it is raised outside this section
    """
1131
1132
def is_path_like(f):
    """Return whether `f` is a str, bytes or os.PathLike object"""
    path_types = (os.PathLike, bytes, str)
    return isinstance(f, path_types)
1135
1136
def extract_timezone(date_str):
    """Split a trailing timezone off a date string

    Recognized are a trailing "Z", a trailing numeric offset ([+-]hh[:]mm) and,
    failing those, a trailing upper-case name following a time that is listed in TIMEZONE_NAMES.

    @returns    (timezone, date_str): the UTC offset as a dt.timedelta (zero if no timezone
                was recognized) and the date string with the recognized timezone removed
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # No "Z"/numeric offset: look for a timezone name directly after a time
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        # `m and ...` yields None when nothing matched, which is not expected to be a key of the table
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        # Only a known name is stripped; unknown ones are left in the string
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        # The values of TIMEZONE_NAMES are used as offsets in hours
        timezone = dt.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        # No sign means that the plain "Z" alternative matched, i.e. UTC
        if not m.group('sign'):
            timezone = dt.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = dt.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1165
1166
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param delimiter    Separator between the date and the time part
    @param timezone     UTC offset (dt.timedelta) of the date. If None, it is extracted from date_str
    @returns            The timestamp, or None if date_str is None or cannot be parsed
    """
    if date_str is None:
        return None

    # Fractional parts (e.g. of the seconds) are discarded
    date_str = re.sub(r'\.[0-9]+', '', date_str)
    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        utc_time = dt.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
        return calendar.timegm(utc_time.timetuple())
    except ValueError:
        return None
1182
1183
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST if day_first is true, DATE_FORMATS_MONTH_FIRST otherwise"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1186
1187
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    @param day_first    Selects which list of formats is tried (see date_formats)
    @returns            The date, or None if date_str is None or cannot be parsed
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    upload_date = None
    # Every format is tried; if several of them match, the last one wins
    for expression in date_formats(day_first):
        try:
            upload_date = dt.datetime.strptime(cleaned, expression).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the parser for email-style dates
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                upload_date = dt.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return str(upload_date)
1210
1211
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp from a date string in one of various formats

    @param day_first    Selects which list of formats is tried (see date_formats)
    @returns            The timestamp, or None if date_str is not a str or cannot be parsed.
                        NB: The value from the email-style fallback is a float
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas, pipes and most weekday names, then collapse runs of whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # Must be determined before "PM" is removed below. NB: matches "pm" anywhere in the string
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # The first format that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta)
            return calendar.timegm(dt_.timetuple())

    # Fall back to the parser for email-style dates
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1243
1244
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL

    @param default_ext  Returned if no extension can be determined
    @returns            The text after the last "." of the URL (query excluded) if it is purely
                        alphanumeric or, ignoring trailing slashes, one of KNOWN_EXTENSIONS
    """
    if url is None or '.' not in url:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1256
1257
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by "<sub_lang>.<sub_format>" (see replace_extension)"""
    sub_ext = '.'.join((sub_lang, sub_format))
    return replace_extension(filename, sub_ext, expected_real_ext)
1260
1261
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(dt.datetime.now(dt.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - dt.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # A plain DATE without an offset
        return datetime_round(dt.datetime.strptime(date_str, format), precision)

    # Everything in front of the last offset is resolved recursively
    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')

    if unit in ('month', 'year'):
        # Calendar-aware arithmetic; the result is then rounded like a day offset
        new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        new_date = start_time + dt.timedelta(**{unit + 's': amount})

    return datetime_round(new_date, unit) if auto_precision else new_date
1302
1303
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param format  strftime format of a plain date
    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
                   Anything else raises ValueError
    """
    if strict:
        allowed = re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str)
        if not allowed:
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1314
1315
def datetime_add_months(dt_, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    # Work with a zero-based month so that divmod yields the year carry directly
    year_carry, month_index = divmod(dt_.month + months - 1, 12)
    year, month = dt_.year + year_carry, month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt_.replace(year, month, min(dt_.day, last_day))
1323
1324
def datetime_round(dt_, precision='day'):
    """
    Round a datetime object's time to a specific precision

    @param precision    microsecond|second|minute|hour|day
                        'microsecond' returns dt_ itself; the others return a new datetime in UTC
    """
    if precision == 'microsecond':
        return dt_

    seconds = calendar.timegm(dt_.timetuple())
    step = {
        'second': 1,
        'minute': 60,
        'hour': 3600,
        'day': 86400,
    }[precision]
    # Adding half a step before flooring rounds to the nearest multiple (halves round up)
    rounded = ((seconds + step / 2) // step) * step
    return dt.datetime.fromtimestamp(rounded, dt.timezone.utc)
1341
1342
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not made up of exactly 8 digits are returned unchanged"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if match is None else '-'.join(match.groups())
1351
1352
class DateRange:
    """Represents a time interval between two dates (both ends are part of the range)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str(..., strict=True)

        A missing start/end leaves the range open on that side (earliest/latest representable date).
        Raises ValueError if start is after end.
        """
        if start is not None:
            self.start = date_from_str(start, strict=True)
        else:
            self.start = dt.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end, strict=True)
        else:
            self.end = dt.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range

        @param date     A date or datetime object (only the date part of the latter is considered),
                        or a string understood by date_from_str
        """
        if isinstance(date, dt.datetime):
            # datetime is a subclass of date, but ordering comparisons between the two raise TypeError
            date = date.date()
        elif not isinstance(date, dt.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __str__(self):
        return f'{self.start} to {self.end}'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1389
1390
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime and the platform (computed once, then cached)"""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_ver = []

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1409
1410
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    # The second field of win32_ver() is the version string
    return version_tuple(platform.win32_ver()[1])
1418
1419
def write_string(s, out=None, encoding=None):
    """Write the string `s` to `out` and flush it

    @param out          Stream to write to. Defaults to sys.stderr; nothing is written if that is unavailable
    @param encoding     Encoding used when bytes have to be written. Defaults to the encoding
                        of the stream (if it has one), else the preferred encoding
    """
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
    mode = getattr(out, 'mode', None) or ''
    if 'b' in mode:
        # Binary stream: write encoded bytes to the stream itself
        target, enc = out, encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream with a binary buffer: write encoded bytes to the buffer
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
    else:
        # Anything else gets the str as it is
        target, enc = out, None

    payload = s.encode(enc, 'ignore') if enc else s
    target.write(payload)
    out.flush()
1440
1441
# TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report the use of a deprecated feature

    When running as the CLI, each distinct message is reported only once, through
    `printer` if given and otherwise as an error line written with write_string.
    Otherwise, a DeprecationWarning is issued through the warnings module.

    @param stacklevel   Number of additional stack frames to skip when attributing the warning
    @param kwargs       Passed to `printer`/write_string
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return None

    already_reported = deprecation_warning._cache
    if msg in already_reported:
        return None
    already_reported.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


deprecation_warning._cache = set()
1458
1459
def bytes_to_intlist(bs):
    """Return the items of `bs` as a list of ints; anything empty gives []

    Items that are not ints already (e.g. the characters of a str) are converted with ord()
    """
    if not bs:
        return []
    already_ints = isinstance(bs[0], int)  # Python 3
    return list(bs) if already_ints else list(map(ord, bs))
1467
1468
def intlist_to_bytes(xs):
    """Pack a sequence of ints (0-255 each) into bytes; anything empty gives b''"""
    if xs:
        return struct.pack(f'{len(xs)}B', *xs)
    return b''
1473
1474
class LockingUnsupportedError(OSError):
    """OSError with a fixed message; raised by the fallback `_lock_file`/`_unlock_file` when `fcntl` cannot be imported"""
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1480
1481
# Cross-platform file locking
# Defines `_lock_file(f, exclusive, block)` and `_unlock_file(f)` using whatever the platform provides:
# `LockFileEx`/`UnlockFileEx` on Windows, `fcntl` elsewhere, and stubs that raise if neither is available
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """The OVERLAPPED structure expected by LockFileEx/UnlockFileEx"""
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Size of the locked region (low/high DWORD), starting at offset 0: large enough to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the file `f`; raises BlockingIOError if LockFileEx fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object since the same structure is needed again for unlocking
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Unlock a file locked with `_lock_file`; raises OSError if UnlockFileEx fails"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock the file `f` with flock(), falling back to lockf()

            Raises BlockingIOError if `block` is false and the lock is held elsewhere
            """
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # The lock is held by someone else: Do not retry with lockf()
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Unlock the file `f`, trying the known variants in turn; only the error of the last one is raised"""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            """Locking is unavailable: always raises LockingUnsupportedError"""
            raise LockingUnsupportedError()

        def _unlock_file(f):
            """Locking is unavailable: always raises LockingUnsupportedError"""
            raise LockingUnsupportedError()
1568
1569
class locked_file:
    """A file that is locked while it is in use, for use as a context manager

    The file is opened on construction, but locked (and, for the "w" modes, truncated)
    only on entering the context/calling `open`. Attributes that are not defined here
    are looked up on the underlying file object, and iterating iterates that file.
    """
    # Whether the lock is currently held
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """
        @param mode         One of r, rb, a, ab, w, wb; anything else raises NotImplementedError
        @param block        Whether to wait for the lock if it is held elsewhere
        @param encoding     Passed to os.fdopen
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        # The flags for os.open are assembled manually so that a file opened for writing is not truncated on opening
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        """Acquire the lock (shared for the read modes, exclusive otherwise), then truncate if writing

        If locking fails with an OSError, the file is closed and the error re-raised
        """
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        # Truncation was deferred until the lock is held (see the flags in __init__)
        if 'w' in self.mode:
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held. The file is considered unlocked afterwards even if releasing failed"""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        """Release the lock and close the file; the file is closed even if unlocking fails"""
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases for use without a `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Only called for attributes not found on this object: delegate to the underlying file
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1633
1634
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None (computed once, then cached)"""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1639
1640
# Translation tables for `shell_quote` on Windows
# Without a shell: backslash-escape backslashes and double quotes
_WINDOWS_QUOTE_TRANS = str.maketrans({
    '\\': '\\\\',
    '"': '\\"',
})
# For `cmd.exe` (`shell=True`)
_CMD_QUOTE_TRANS = str.maketrans({
    # Use zero length variable replacement so `%` doesn't get expanded
    # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`)
    '%': '%%cd:~,%',
    # While we are only required to escape backslashes immediately before quotes,
    # we instead escape all of 'em anyways to be consistent
    '\\': '\\\\',
    # Requires a variable `=` containing `"^\n\n"` (set in `utils.Popen`)
    # `=` should be unique since variables containing `=` cannot be set using cmd
    '\n': '%=%',
    # Keep quotes balanced by replacing them with `""` instead of `\\"`
    '"': '""',
})
1655
1656
def shell_quote(args, *, shell=False):
    """Quote the argument(s) and join them into a single command line string

    @param args     A single argument or an iterable of arguments. Passing bytes is deprecated
    @param shell    Only has an effect on Windows: quote for `cmd.exe` instead of for
                    a process that is started directly
    """
    args = list(variadic(args))
    if any(isinstance(item, bytes) for item in args):
        deprecation_warning('Passing bytes to utils.shell_quote is deprecated')
        encoding = get_filesystem_encoding()
        args = [item.decode(encoding) if isinstance(item, bytes) else item for item in args]

    if compat_os_name != 'nt':
        return shlex.join(args)

    trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS

    def quote(arg):
        # Arguments made up of harmless characters only are passed through as they are
        if re.fullmatch(r'[\w#$*\-+./:?@\\]+', arg, re.ASCII):
            return arg
        return '"' + arg.translate(trans) + '"'

    return ' '.join(map(quote, args))
1673
1674
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    The data is JSON-encoded into the URL fragment. Data that is already
    smuggled into `url` is kept and takes precedence over `data`.

    @param url   the URL to attach the data to
    @param data  a JSON-serializable dict; it is not modified
    @returns     the URL with the merged data smuggled into its fragment
    """
    url, smuggled = unsmuggle_url(url, {})
    # Merge into a fresh dict so that the caller's dict is not mutated
    payload = {**data, **smuggled}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(payload)})
    return url + '#' + sdata
1683
1684
def unsmuggle_url(smug_url, default=None):
    """Split a URL into the plain URL and the data smuggled into its fragment

    Returns `(smug_url, default)` when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition('#')
        payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
        return url, json.loads(payload)
    return smug_url, default
1692
1693
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format a number with a decimal suffix like k, M, G, etc

    @param num     the number to format; anything accepted by `float_or_none`
    @param fmt     %-style format that is applied to `(scaled_number, suffix)`
    @param factor  ratio between two consecutive suffixes;
                   1024 selects the binary suffixes (Ki, Mi, ...)
    @returns       the formatted string, or None if `num` is not a number or is negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp at both ends: tiny numbers have a negative logarithm, which would
        # otherwise index the suffix list from its end (e.g. 0.0001 => '0Y')
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
1706
1707
def format_bytes(bytes):
    """Format a byte count using binary suffixes, or 'N/A' if it cannot be formatted"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
1710
1711
def lookup_unit_table(unit_table, s, strict=False):
    """Parse "<number><unit>" and return the number multiplied by the unit's value, rounded

    @param unit_table  mapping of unit name to multiplier
    @param s           the string to parse
    @param strict      require the whole string to match and only accept `.` as decimal separator
    @returns           the rounded product, or None if `s` does not match
    """
    if strict:
        num_re, matcher = NUMBER_RE, re.fullmatch
    else:
        # Also accept a comma as the decimal separator
        num_re, matcher = NUMBER_RE.replace(r'\.', '[,.]'), re.match
    units_re = '|'.join(map(re.escape, unit_table))
    mobj = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not mobj:
        return None

    number = float(mobj.group('num').replace(',', '.'))
    return round(number * unit_table[mobj.group('unit')])
1723
1724
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    suffixes = ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
    unit_table = {suffix: 1024 ** power for power, suffix in enumerate(suffixes)}
    return lookup_unit_table(unit_table, s.upper(), strict=True)
1730
1731
def parse_filesize(s):
    """Parse a human readable file size such as "5.3 MiB" into a number of bytes

    Returns None if `s` is None or cannot be parsed.
    """
    if s is None:
        return None

    # (prefix letter, decimal prefix name, binary prefix name) in ascending order of magnitude
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        # The lower-case forms are of course incorrect and unofficial,
        # but we support those too
        unit_table.update({
            f'{letter}iB': binary,
            f'{letter}B': decimal,
            f'{lower}B': binary,
            f'{letter}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
1801
1802
def parse_count(s):
    """Parse a count such as "1,234", "1.5M" or "Views 3k" into an integer (None on failure)"""
    if s is None:
        return None

    # Drop a leading non-numeric label
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {}
    unit_groups = (('k', 'K'), ('m', 'M', 'kk', 'KK'), ('b', 'B'))
    for power, units in enumerate(unit_groups, 1):
        multipliers.update(dict.fromkeys(units, 1000 ** power))

    ret = lookup_unit_table(multipliers, s)
    if ret is not None:
        return ret

    # Fall back to a leading number followed by arbitrary text
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(mobj.group(1)) if mobj else None
1830
1831
def parse_resolution(s, *, lenient=False):
    """Extract the resolution from a string as a dict with 'width' and/or 'height'

    Recognizes "<width>x<height>", "<height>p"/"<height>i" and "4k"/"8k".
    With `lenient`, "<width>x<height>" may be directly adjacent to other
    alphanumeric characters. Returns an empty dict if nothing is found.
    """
    if s is None:
        return {}

    dimensions_re = (
        r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
        else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    dimensions = re.search(dimensions_re, s)
    if dimensions:
        width, height = dimensions.group('w', 'h')
        return {'width': int(width), 'height': int(height)}

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    multiple_of_k = re.search(r'\b([48])[kK]\b', s)
    if multiple_of_k:
        return {'height': int(multiple_of_k.group(1)) * 540}

    return {}
1855
1856
def parse_bitrate(s):
    """Return the first "<number> kbps" value found in a string as an int, else None"""
    if isinstance(s, str):
        mobj = re.search(r'\b(\d+)\s*kbps', s)
        if mobj:
            return int(mobj.group(1))
    return None
1863
1864
def month_by_name(name, lang='en'):
    """ Return the number of a month given its full name in `lang`
        (languages missing from MONTH_NAMES fall back to English), else None """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name in month_names:
        return month_names.index(name) + 1
    return None
1874
1875
def month_by_abbreviation(abbrev):
    """ Return the number of a month given its (locale-independent) three
        letter English abbreviation, else None """

    for number, month_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month_name[:3] == abbrev:
            return number
    return None
1884
1885
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a valid reference by '&amp;' in XML

    The predefined entities (amp, lt, gt, apos, quot) and numeric character
    references of any length (e.g. `&#128512;`, `&#x1F600;`) are left untouched.
    A numeric reference needs at least one digit, so `&#;` gets escaped.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
        '&amp;',
        xml_str)
1892
1893
def setproctitle(title):
    """Set the name of the process through `prctl(PR_SET_NAME)`

    This is best-effort: nothing happens if `ctypes` or `libc.so.6` cannot be
    loaded, or if the loaded libc has no `prctl`.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Create the buffer from the bytes themselves rather than from their length:
    # ctypes then makes it one byte larger, so the name is always NUL-terminated
    # and prctl cannot read past the end of the buffer
    buf = ctypes.create_string_buffer(title.encode())
    try:
        # PR_SET_NAME = 15      Ref: /usr/include/linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1920
1921
def remove_start(s, start):
    """Return `s` without the prefix `start`; `s` is returned as-is if it is None or lacks the prefix"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1924
1925
def remove_end(s, end):
    """Return `s` without the suffix `end`; `s` is returned as-is if it is None or lacks the suffix"""
    if s is None or not s.endswith(end):
        return s
    # An empty suffix must be special-cased since `s[:-0]` is the empty string
    return s[:-len(end)] if end else s
1928
1929
def remove_quotes(s):
    """Strip one pair of matching single or double quotes that surrounds `s`, if there is one"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1937
1938
def get_domain(url):
    """
    Return the network location of the URL without a leading "www.", or None if it is empty

    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1945
1946
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and trailing slashes"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1950
1951
def base_url(url):
    """Return the http(s) URL up to and including the last slash before any query or fragment"""
    mobj = re.match(r'https?://[^?#]+/', url)
    return mobj.group()
1954
1955
def urljoin(base, path):
    """Join `path` onto `base`, returning None instead of raising on unusable input

    A `path` that is already absolute or protocol-relative is returned unchanged.
    Otherwise `base` must be an http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not (isinstance(path, str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if isinstance(base, str) and re.match(r'^(?:https?:)?//', base):
        return urllib.parse.urljoin(base, path)
    return None
1969
1970
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to int as `int(v) * invscale // scale`, returning `default` on failure

    If `get_attr` is given, the attribute of that name is read from `v` first.
    """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    try:
        converted = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        converted = default
    return converted
1978
1979
def str_or_none(v, default=None):
    """Return `str(v)`, or `default` if `v` is None"""
    if v is None:
        return default
    return str(v)
1982
1983
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: `,`, `.` and `+` are ignored in strings;
        anything that is neither int nor str gives None """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
1991
1992
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to float as `float(v) * invscale / scale`, returning `default` on failure"""
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
2000
2001
def bool_or_none(v, default=None):
    """Return `v` if it is a bool, else `default`"""
    if isinstance(v, bool):
        return v
    return default
2004
2005
def strip_or_none(v, default=None):
    """Return `v` stripped of surrounding whitespace if it is a str, else `default`"""
    if isinstance(v, str):
        return v.strip()
    return default
2008
2009
def url_or_none(url):
    """Return the stripped URL if it is protocol-relative or uses a supported scheme, else None"""
    if not (url and isinstance(url, str)):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
2015
2016
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a unix timestamp (int/float) or a YYYYMMDD string with `date_format`

    `%s` in `date_format` is replaced by the unix timestamp on every platform.
    Returns `default` if the input cannot be handled.
    """
    moment = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, dt.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            epoch = dt.datetime.fromtimestamp(0, dt.timezone.utc)
            moment = epoch + dt.timedelta(seconds=timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            moment = dt.datetime.strptime(timestamp, '%Y%m%d')
        # For any other input `moment` stays None and the AttributeError below gives `default`
        unix_seconds = int(moment.timestamp())
        # Support %s on windows
        date_format = re.sub(r'(?<!%)(%%)*%s', rf'\g<1>{unix_seconds}', date_format)
        return moment.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2034
2035
def parse_duration(s):
    """Parse a duration string into a number of seconds (float)

    Handles clock-like values ("1:02:03.5"), unit-based and ISO 8601-like
    values ("3h 11m 53s", "PT1H0.040S") and fractional hours/minutes
    ("2.5 hours"). Returns None if `s` is not a string or cannot be parsed.
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None
    clock = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if clock:
        days, hours, mins, secs, ms = clock.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # Years, months and weeks are matched but do not count towards the result
        with_units = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if with_units:
            days, hours, mins, secs, ms = with_units.groups()
        else:
            fractional = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not fractional:
                return None
            hours, mins = fractional.groups()

    if ms:
        ms = ms.replace(':', '.')

    total = 0
    for part, seconds_per_unit in (
            (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)):
        total += float(part or 0) * seconds_per_unit
    return total
2090
2091
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the file's real extension

    If `expected_real_ext` is given and differs from the real extension,
    `ext` is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
2098
2099
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's real extension by `ext`

    If `expected_real_ext` is given and differs from the real extension,
    `ext` is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return f'{name}.{ext}'
2105
2106
def check_executable(exe, args=[]):
    """ Check that the given binary can be run, returning its name on success
    and False if running it raises OSError.
    args can be a list of arguments for a short output (like -version) """
    try:
        Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
2115
2116
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout/stderr as text

    Returns None if the exit code is non-zero and False if running it raises OSError.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, ret = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if ret else stdout
2129
2130
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return group 1 of the first match of `version_re` in `output`, or `unrecognized` if there is none

    By default, the token following the word "version" is looked for.
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    mobj = re.search(pattern, output)
    return mobj.group(1) if mobj else unrecognized
2140
2141
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    `unrecognized` holds one or two values: the first is returned when the version
    cannot be found in the output, the last when there is no usable output """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    out = _get_exe_version_output(exe, args)
    if out is None:
        return unrecognized[-1]
    if not out:
        # Falsy output (e.g. False for a missing executable) is passed through unchanged
        return out
    return detect_exe_version(out, version_re, unrecognized[0])
2152
2153
def frange(start=0, stop=None, step=1):
    """Float range: like `range`, but accepts floats; a step of zero yields nothing"""
    if stop is None:
        start, stop = 0, start
    if not step:
        sign = 0
    elif step > 0:
        sign = 1
    else:
        sign = -1
    current = start
    # Multiplying both sides by the sign makes one comparison cover both directions
    while sign * current < sign * stop:
        yield current
        current += step
2162
2163
class LazyList(collections.abc.Sequence):
    """Immutable list that is lazily filled from an iterable

    Items are pulled from the iterable only when they are needed and are cached.
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = _cache if _cache is not None else []
        self._reversed = reverse

    def __iter__(self):
        if not self._reversed:
            yield from self._cache
            for item in self._iterable:
                self._cache.append(item)
                yield item
            return
        # We need to consume the entire iterable to iterate in reverse
        yield from self.exhaust()

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        direction = -1 if self._reversed else 1
        return self._exhaust()[::direction]

    @staticmethod
    def _reverse_index(x):
        if x is None:
            return None
        return ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        counts_from_end = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if counts_from_end:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            # Only pull as many items as are needed to reach the requested index
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        probe = -1 if self._reversed else 0
        try:
            self[probe]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        return len(self._exhaust())

    def __reversed__(self):
        cls = type(self)
        return cls(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        cls = type(self)
        return cls(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2251
2252
class PagedList:
    """Base class for lists whose entries are fetched page by page

    `pagefunc(pagenum)` must give the entries of one page.
    Subclasses implement `_getslice`.
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __bool__(self):
        return bool(self.getslice(0, 1))

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not (isinstance(idx, int) and idx >= 0):
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()

    def getpage(self, pagenum):
        """Return the entries of the given page as a list, using the cache if enabled"""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            if pagenum > self._pagecount:
                # Pages beyond the known page count are empty
                page_results = []
            else:
                page_results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the entries in `[start, end)` as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')
2294
2295
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        """Yield the entries in `[start, end)`, fetching only the pages that are needed

        `end=None` means "until the last page".
        """
        # An empty or inverted range has no entries. Without this guard,
        # `getslice(n, n)` with a page-aligned `n` yields a whole page
        # and `end < start` yields everything from `start` on
        if end is not None and end <= start:
            return

        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            # Offsets within the current page; only the first page can start
            # past its beginning and only the last one can end before its end
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember that this page could not be fetched, so that
                # `getpage` treats the pages after it as empty
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2335
2336
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        """Yield the entries in `[start, end)`, never going past the known page count"""
        start_page = start // self._pagesize
        if end is None:
            end_page, remaining = self._pagecount, None
        else:
            end_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start
        # Number of entries to drop from the front of the first page
        skip_elems = start - start_page * self._pagesize

        for pagenum in range(start_page, end_page):
            page = self.getpage(pagenum)
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if remaining is not None:
                if len(page) >= remaining:
                    yield from page[:remaining]
                    break
                remaining -= len(page)
            yield from page
2360             yield from page_results
2361
2362
class PlaylistEntries:
    """Access the entries of a playlist info dict by 1-based playlist index

    Wraps `info_dict['entries']`, which may be a list, a PagedList, a LazyList
    or any other iterable (which is then wrapped in a LazyList).
    Indexing/slicing an instance gives a generator of `(playlist_index, entry)`.
    """
    # Placeholder for the positions of an incomplete playlist that hold no entry
    MissingEntry = object()
    # Set when the entries are a list, or once indexing has run past the last entry
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        object providing `params`, `report_warning`, `_match_entry`
                          and (through its type) `_handle_extraction_exceptions`
        @param info_dict  playlist info dict; must have 'entries', may have 'requested_entries'
        @raises           EntryNotInPlaylist if there are no entries
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # `entries` only holds the requested items: place each one at its (1-based)
            # playlist position and mark all the other positions as missing
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # One segment of a playlist items specification: "[START][:|-[END][:STEP]]"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Parse a comma-separated playlist items specification

        Yields an int for each single index and a slice for each range.
        @raises ValueError  for empty or malformed segments and for a step of zero
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # `end` goes through float_or_none so that "inf"/"infinite" are accepted
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield `(playlist_index, entry)` for the items selected by the `ydl` params

        The selection comes from 'playlist_items', or else from 'playliststart'/'playlistend'.
        Iteration ends early when matching an entry raises
        ExistingVideoReached or RejectedVideoReached.
        """
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries if it is known without further evaluation, else None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page, the page count is the entry count
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function that maps a 0-based index to its entry

        It raises `self.IndexError` when the index is past the end of the entries.
        For a list, EntryNotInPlaylist is raised for the missing entries
        of an incomplete playlist.
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # The lookup is wrapped with the `_handle_extraction_exceptions` decorator of the `ydl` type
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Generate `(playlist_index, entry)` for a 1-based index or slice

        Unlike for lists, the stop of a slice is inclusive and
        negative values count from the end of the playlist.
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # frange excludes its stop, so move it one step further to include the last index
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forwards, nothing can follow; going backwards,
                # the lower indices may still exist
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Evaluates all the entries
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2497
2498
def uppercase_escape(s):
    r"""Decode all `\UXXXXXXXX` escape sequences in a string"""
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', unescape, s)
2505
2506
def lowercase_escape(s):
    r"""Decode all `\uXXXX` escape sequences in a string"""
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', unescape, s)
2513
2514
def parse_qs(url, **kwargs):
    """Parse the query string of a URL into a dict of lists; kwargs go to `urllib.parse.parse_qs`"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2517
2518
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file, one per line, and close the file

    Lines may be str or bytes. Blank lines and lines starting with `#`, `;`
    or `]` are skipped; a comment starting with " #" is removed from a line.

    @param batch_fd  an iterable of lines that has a `close` method
    @returns         list of URLs
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # The BOM may show up decoded, or as its raw bytes mapped to single characters
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit is passed by keyword; passing it positionally is deprecated
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2536
2537
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like `urllib.parse.urlencode`, returning ASCII bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2540
2541
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  mapping merged over the existing query parameters
                            (mutually exclusive with a `query` kwarg)
       @returns             str
    """
    if isinstance(url, str):
        # Nothing to change: hand the string back without a parse round-trip
        if not (kwargs or query_update):
            return url
        url = urllib.parse.urlparse(url)

    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        existing_query = urllib.parse.parse_qs(url.query)
        # Later keys win, so updated values replace the existing ones
        kwargs['query'] = urllib.parse.urlencode({**existing_query, **query_update}, True)

    replaced = url._replace(**kwargs)
    return urllib.parse.urlunparse(replaced)
2560
2561
def update_url_query(url, query):
    """Return url with its query parameters updated from query (shorthand for update_url)"""
    updated_url = update_url(url, query_update=query)
    return updated_url
2564
2565
def _multipart_encode_impl(data, boundary):
    """Encode data as multipart/form-data using the given boundary

    @param data      dict whose keys and values are str or bytes-like
    @param boundary  str (ASCII) used as the part delimiter
    @returns         (body bytes, Content-Type header value)
    @raises ValueError  If the boundary occurs inside any encoded field
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    fields = data.items()
    marker = boundary.encode('ascii')

    chunks = []
    for name, value in fields:
        chunks.append(b'--' + marker + b'\r\n')
        if isinstance(name, str):
            name = name.encode()
        if isinstance(value, str):
            value = value.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        field = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if marker in field:
            raise ValueError('Boundary overlaps with data')
        chunks.append(field)

    # Closing delimiter
    chunks.append(b'--' + marker + b'--\r\n')

    return b''.join(chunks), content_type
2586
2587
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple of (encoded body as bytes, Content-Type header value).
    Raises ValueError if a caller-supplied boundary occurs in the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-chosen boundary is never replaced, so a clash propagates
        return _multipart_encode_impl(data, boundary)

    # Keep drawing random boundaries until one does not clash with the data
    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
2616
2617
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether x is an instance of allowed_types but not of blocked_types

    When blocked_types is not given, str, bytes and mappings are blocked
    """
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
2622
2623
def variadic(x, allowed_types=NO_DEFAULT):
    """Return x unchanged if it is iterable-like, else wrap it in a 1-tuple

    allowed_types are the types that are wrapped even though they are iterable
    """
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
2629
2630
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn and return the first acceptable result

    A function is skipped if it raises one of the common lookup/conversion
    errors, or if its result is not an instance of expected_type (when given).
    Returns None if no function produced an acceptable result
    """
    SKIPPED_ERRORS = (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError)
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except SKIPPED_ERRORS:
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
2640
2641
def try_get(src, getter, expected_type=None):
    """Apply one getter (or each of several getters) to src via try_call and return the first acceptable result"""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
2644
2645
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of dct for which cndn(key, value) is truthy

    By default, items whose value is None are dropped
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
2648
2649
def merge_dicts(*dicts):
    """Merge dicts, with earlier dicts taking precedence

    A value is taken if it is not None and the key is still unset. A str
    value may additionally replace an already stored empty string
    """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if key not in merged:
                if value is None:
                    continue
            elif not (isinstance(value, str) and merged[key] == ''):
                continue
            merged[key] = value
    return merged
2658
2659
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as str, decoding it with encoding/errors if it is not a str already"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
2662
2663
# Rating label -> age limit. parse_age_limit() looks up the upper-cased input here
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2671
2672
# Rating label -> age limit. Every key must start with "TV-": parse_age_limit()
# builds its regex from the part after the first three characters of each key
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2681
2682
def parse_age_limit(s):
    """Convert an age rating to an integer age limit, or None if not recognised

    Accepts an int in the range 0..21, a string of one or two digits with an
    optional trailing "+", a US_RATINGS label, or a TV_PARENTAL_GUIDELINES
    label (case-insensitive; the separator after "TV" may be "-", "_" or absent)
    """
    # bool is a subclass of int, so an exact type check is needed to reject it
    if type(s) is int:  # noqa: E721
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]

    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, s)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
2699
2700
def strip_jsonp(code):
    """Strip a JSONP wrapper `callback(...)` from code and return the wrapped data

    The callback may be prefixed by "window." and guarded as `name && name(...)`.
    Input that does not look like JSONP is returned unchanged
    """
    jsonp_rex = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_rex.sub(r'\g<callback_data>', code)
2709
2710
def js_to_json(code, vars={}, *, strict=False):
    """Convert JavaScript object/value literal source into JSON text (best effort)

    @param code    str of JavaScript code
    @param vars    dict of identifier -> replacement text; a replacement that is
                   not valid JSON is emitted as a JSON string when not strict
    @param strict  if true, raise ValueError on identifiers that cannot be
                   resolved (instead of quoting them) and skip the lenient
                   rewrites of `new X(...)`, `parseInt(...)` and
                   immediately-invoked functions
    @returns       str
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # One alternative per quote character: the quoted body may contain
    # backslash escapes, but no unescaped occurrence of the same quote
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Optional whitespace and at most one comment
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (regex, base) pairs for hexadecimal and octal integers, optionally used as object keys
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # group(1) is a bare double quote, group(2) the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Escapes in the passthrough list keep (or gain) their backslash, "\x"
        # becomes the "\u00" prefix (the hex digits after it are left in place),
        # an escaped newline is dropped and any other character is unescaped
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Convert the expression inside `${...}` recursively; a JSON string
        # result is unwrapped so that its text can be spliced into the template
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Replacement callback for every token matched by the final regex below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        # Comments, "!" runs and trailing commas are removed
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            # Template literals get their `${...}` placeholders substituted first
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # A trailing colon means the number is an object key, which must be quoted
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # `new Map([[k, v], ...])` -> JSON object; no argument means an empty map
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        # Any other constructor call is kept verbatim as a JSON string
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2790
2791
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 if the id is not in the list
    """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
2800
2801
# Names of the postprocessing stages
# NOTE(review): the tuple is presumably listed in execution order - confirm against its users
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
2803
2804
# Default output filename templates (%-style with named fields), keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types, each mapped to a fixed string or None
# NOTE(review): the strings look like the filename suffix used for that kind of
# file, with None meaning there is no fixed suffix - confirm against the users
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
2822
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# This is a template, not a finished regex: {0} must be replaced by a pattern
# for the mapping key and {1} by a pattern for the conversion type
# (presumably via str.format - confirm against the users).
# The prefix group captures any run of escaped "%%" in front of the specifier
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# Conversion type characters of %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2841
2842
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that the result, including the
    trailing "...", is length characters long. None is passed through
    """
    if s is None:
        return None
    too_long = len(s) > length
    if not too_long:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
2851
2852
def version_tuple(v):
    """Split a version string on "." and "-" and return its components as a tuple of ints

    Raises ValueError if a component is not an integer
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2855
2856
def is_outdated_version(version, limit, assume_new=True):
    """Whether version is older than limit

    If version is empty or either version cannot be parsed, the version is
    treated as new when assume_new is true and as outdated otherwise
    """
    unknown_result = not assume_new
    if not version:
        return unknown_result
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return unknown_result
2864
2865
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported inside the function rather than at module level
    from ..update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
2872
2873
def args_to_str(args):
    """Get a short string representation for a subprocess command"""
    quoted_command = shell_quote(args)
    return quoted_command
2877
2878
2879 def error_to_str(err):
2880     return f'{type(err).__name__}: {err}'
2881
2882
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (parameters after ";" are ignored) to a file extension

    The full type is looked up first, then its subtype, then the part of the
    subtype after the last "+". If nothing matches, default is returned when
    given; otherwise the subtype itself with "+" replaced by ".".
    A non-str mt yields default when given, else None
    """
    if not isinstance(mt, str):
        return None if default is NO_DEFAULT else default

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'video/ogg': 'ogv',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    mimetype = mt.partition(';')[0].strip().lower()
    subtype = mimetype.rpartition('/')[2]
    suffix = subtype.rsplit('+')[-1]

    # Most specific key first: full type, then subtype, then structured-syntax suffix
    ext = traversal.traverse_obj(MAP, mimetype, subtype, suffix)
    if ext:
        return ext
    if default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
2972
2973
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension or for a filename/URL

    Returns None for empty input or when the type cannot be guessed
    """
    if not ext_or_url:
        return None
    # A value without any dot is treated as a bare extension
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    guessed = mimetypes.guess_type(filename)
    return guessed[0]
2980
2981
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video/audio/subtitle codecs

    Returns {} for empty input. Otherwise returns a dict with 'vcodec',
    'acodec' ('none' when absent), 'dynamic_range' and, if a subtitle codec
    was found, 'scodec'. When nothing is recognised but exactly two codecs
    were given, they are taken as video and audio codec respectively
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    stripped = (part.strip() for part in codecs_str.strip().strip(',').split(','))
    split_codecs = [codec for codec in stripped if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Leading zeros of numbers are dropped before splitting into parts
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            # Only the first video codec counts
            if vcodec:
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            if not acodec:
                acodec = full_codec
        elif family in SUBTITLE_CODECS:
            if not scodec:
                scodec = full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if not (vcodec or acodec or scodec):
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        return {}

    result = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if scodec is not None:
        result['scodec'] = scodec
    return result
3022
3023
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension for merging the given video/audio streams

    @param vcodecs, acodecs  codecs of the video/audio streams
    @param vexts, aexts      extensions of the streams (same lengths as the codec lists)
    @param preferences       extensions to consider, in order of preference
    @returns                 the chosen extension
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    # More than one stream of a kind: fall back to mkv directly
    if allow_mkv and (len(acodecs) > 1 or len(vcodecs) > 1):
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codec_list):
        # First codec only: part before the first ".", with zeros removed, lower-cased
        return try_get(codec_list, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())

    wanted_codecs = (sanitize_codec(vcodecs), sanitize_codec(acodecs))

    # First pass: pick a container known to support both codecs
    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset(wanted_codecs):
            return ext

    # Second pass: pick a container whose family covers all the stream extensions
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext}:
            return ext
        if any(ext_family.issuperset(current_exts) for ext_family in COMPATIBLE_EXTS):
            return ext

    if allow_mkv:
        return 'mkv'
    return preferences[-1]
3063
3064
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Detect a file extension from the headers of a response

    Tried in order: the filename of a Content-Disposition attachment, the
    part after the last "." of the x-amz-meta-name header, and finally the
    Content-Type (via mimetype2ext, which also handles default)
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        attachment = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    meta_name = headers.get('x-amz-meta-name')
    if meta_name:
        ext = meta_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'), default=default)
3083
3084
def encode_data_uri(data, mime_type):
    """Build a base64 `data:` URI of the given MIME type from bytes data"""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3087
3088
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3097
3098
# List of known byte-order-marks (BOM) as (mark, encoding name) pairs
# Order matters: is_html() tries the marks in this order, so the 4-byte UTF-32-LE
# mark has to come before the 2-byte UTF-16-LE mark, which is a prefix of it
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3107
3108
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) if the decoded text starts with "<"
    after optional whitespace, else None
    """

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip every leading occurrence of the mark and remember its encoding
        while first_bytes.startswith(bom):
            first_bytes = first_bytes[len(bom):]
            encoding = bom_encoding

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3118
3119
def determine_protocol(info_dict):
    """Return the 'protocol' of info_dict if set, else derive it from its 'url'

    The URL prefix (rtmp/mms/rtsp) is checked first, then the extension
    (m3u8/f4m); otherwise the URL scheme is returned
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for streaming_protocol in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(streaming_protocol):
            return streaming_protocol

    ext = determine_ext(url)
    if ext == 'm3u8':
        if info_dict.get('is_live'):
            return 'm3u8'
        return 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3140
3141
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param delim       str repeated to build a separator row below the header
    @param extra_gap   number of additional spaces between columns
    @param hide_empty  drop the columns in which every data cell has zero width
    @returns           the rendered table as a single str
    """
    def width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(rows):
        column_widths = []
        for column in zip(*rows):
            column_widths.append(max(width(str(cell)) for cell in column))
        return column_widths

    def filter_using_list(row, keep_flags):
        # Positions without a corresponding flag are kept
        pairs = itertools.zip_longest(keep_flags, row, fillvalue=True)
        return [cell for keep, cell in pairs if keep]

    # A zero width acts as a falsy "keep" flag for that column
    keep_flags = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, keep_flags)
    data = [filter_using_list(row, keep_flags) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        separator_row = [delim * (column_width + extra_gap) for column_width in max_lens]
        table = [header_row, separator_row] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter

    rendered_lines = []
    for row in table:
        cells = []
        for pos, text in enumerate(map(str, row)):
            padding = max_lens[pos] - width(text)
            if '\t' in text:
                # The padding goes where the tab is, right-aligning the text after it
                cells.append(text.replace('\t', ' ' * padding) + ' ' * extra_gap)
            else:
                cells.append(text + ' ' * (padding + extra_gap))
        rendered_lines.append(''.join(cells).rstrip())
    return '\n'.join(rendered_lines)
3172
3173
def _match_one(filter_part, dct, incomplete):
    """Evaluate a single filter condition against dct

    @param filter_part  One condition: either `key OP value` (OP may be
                        prefixed by "!" to negate it and suffixed by "?" to
                        also pass when the key is missing) or a unary
                        `key` / `!key` check
    @param dct          dict in which the key is looked up
    @param incomplete   bool, or a collection of keys that may be missing;
                        conditions on such keys pass when the value is None
    @returns            Truthy if the condition passes
    @raises ValueError  If the condition cannot be parsed, or a string-only
                        operator is used with a numeric comparison value
    """
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    if isinstance(incomplete, bool):
        is_incomplete = lambda _: incomplete
    else:
        is_incomplete = lambda k: k in incomplete

    operator_rex = re.compile(r'''(?x)
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        # Exactly one of the two value groups matched, and both require at
        # least one character, so one of them is always a non-empty string
        comparison_value = m['quotedstrval'] or m['strval']
        if m['quote']:
            # Unescape the quote character inside a quoted value
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, (int, float)):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                # Not a plain integer: try a filesize (with and without a
                # "B" suffix) and then a duration
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            return is_incomplete(m['key']) or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    # For bool values the unary operators test the value itself,
    # for everything else they test whether the value is None
    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if is_incomplete(m.group('key')) and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
3252
3253
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    # Conditions are separated by "&"; a literal "&" is written as "\&"
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        condition = filter_part.replace(r'\&', '&')
        if not _match_one(condition, dct, incomplete):
            return False
    return True
3264
3265
def match_filter_func(filters, breaking_filters=None):
    """Build a match-filter function from filter strings

    @param filters           filter string(s); an entry passes if any of them
                             matches. The special filter "-" enables
                             interactive mode
    @param breaking_filters  filter string(s) whose failure raises
                             RejectedVideoReached instead of skipping
    @returns                 None if no filters were given, else a function
                             (info_dict, incomplete=False) that returns None to
                             accept, NO_DEFAULT to ask interactively, or a
                             message explaining why the entry is skipped
    """
    if not (filters or breaking_filters):
        return None
    repr_ = f'{match_filter_func.__module__}.{match_filter_func.__qualname__}({filters}, {breaking_filters})'

    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    # No-op unless "-" is present
    filters.discard('-')

    @function_with_repr.set_repr(repr_)
    def _match_func(info_dict, incomplete=False):
        rejection = breaking_filters(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if filters and not any(match_str(f, info_dict, incomplete) for f in filters):
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'

        if interactive and not incomplete:
            return NO_DEFAULT
        return None
    return _match_func
3291
3292
class download_range_func:
    """Callable that yields the sections of a video to download

    @param chapters   regexes; every chapter whose title matches one is yielded
    @param ranges     (start, end) pairs; negative values count from the end
                      of the video when its duration is known
    @param from_info  also yield the section given by the start_time/end_time
                      of the info dict, if either is set
    """

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters = chapters
        self.ranges = ranges
        self.from_info = from_info

    def __call__(self, info_dict, ydl):
        if info_dict.get('chapters'):
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'

        for regex in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(regex, chapter['title']):
                    continue
                warning = None
                yield {**chapter, 'index': index}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # Nothing was requested: a single empty section
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # A negative time is an offset from the end, clamped at the start of the video
        if info.get('duration') and time < 0:
            return max(info['duration'] + time, 0)
        return time

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3333
3334
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds (float)

    Supports a plain number with an optional "s" suffix and the clock form
    H:MM:SS with an optional fraction introduced by "." or ":".
    Returns None for empty or unrecognised input
    """
    if not time_expr:
        return

    offset = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3346
3347
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode (HH:MM:SS,mmm)"""
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
3350
3351
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode (H:MM:SS.cc, with centiseconds)"""
    timetuple = timetuple_from_msec(seconds * 1000)
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
3355
3356
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT.

    Every <p> element that has a begin time and either an end time or a
    duration becomes one SRT cue. The supported tts:* styling attributes
    (see SUPPORTED_STYLING) are rendered as <b>, <i>, <u> and <font> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> element
    '''
    # (current namespace, [legacy namespaces that are rewritten to it])
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # style id -> {property: value}, with inherited properties already merged in
    styles = {}
    # Style taken from <body>/<div>; applied to every element before its own style
    default_style = {}

    class TTMLPElementParser:
        """XMLParser target that renders one <p> element as SRT markup"""
        _out = ''
        # NOTE: the two lists below are class attributes, i.e. they are shared by
        # all parser instances created during one dfxp2srt() call. start() pushes
        # onto _applied_styles whenever an element has any style, but end() only
        # pops when that element actually opened a tag, so an entry can remain
        # visible to later elements and paragraphs.
        # NOTE(review): changing this alters the generated markup, so it is
        # deliberately left as is here.
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                # Effective style: document default, then the referenced style,
                # then the inline tts:* attributes
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties that the innermost applied style already set
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened for this element, innermost first
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize the node again and run it through the SRT-rendering target
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve the <style> definitions, including inheritance through their
    # "style" attribute. A style can only be resolved once its parent is known
    # and parents may be declared after their children, so keep making passes
    # while something is unresolved AND the last pass resolved something new.
    # Without the progress check, a reference to an undefined style or a
    # reference cycle would make this loop spin forever; such styles are
    # simply left out of `styles`.
    style_nodes = dfxp.findall(_x('.//ttml:style'))
    while True:
        resolved_before = len(styles)
        unresolved = False
        for style in style_nodes:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    unresolved = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            # Register the style even when it sets no supported property, so
            # that styles inheriting from it can still be resolved
            resolved = styles.setdefault(style_id, {})
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    resolved[prop] = prop_val
        if not unresolved or len(styles) == resolved_before:
            break

    # The style referenced by the first <body> and the first <div> is the default
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # The cue number is the position of the <p> in the document, so skipped
    # paragraphs leave gaps in the numbering
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3523
3524
def cli_option(params, command_option, param, separator=None):
    """
    Build the command line arguments for an option that takes a value.

    @param params          Mapping the value is looked up in
    @param command_option  The option as it should appear on the command line
    @param param           Key of the value in params
    @param separator       If given, option and value are joined with it into
                           a single argument instead of being passed as two
    @returns               A list of arguments; empty if the value is None
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3530
3531
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """
    Build the command line arguments for an option that takes a boolean value.

    The value stored in params must be True, False or None. It is translated
    to true_value/false_value and then rendered by cli_option(), which
    returns an empty list when the value is None.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered_values = {True: true_value, False: false_value}
    return cli_option(rendered_values, command_option, flag, separator)
3536
3537
def cli_valueless_option(params, command_option, param, expected_value=True):
    """
    Build the command line arguments for an option that takes no value.

    @returns [command_option] if params[param] equals expected_value,
             an empty list otherwise
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
3540
3541
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """
    Pick the extra arguments configured for one of the given keys.

    @param argdict     Dict mapping lower-case keys to argument lists, or None.
                       A plain list/tuple is accepted for backward
                       compatibility
    @param keys        List/tuple of candidates in order of preference. Each
                       candidate is a key or a group of keys; the key names
                       are lower-cased before the lookup
    @param default     Returned (as is) when nothing matches
    @param use_compat  Whether a list/tuple argdict is returned as is.
                       Otherwise it is treated like None
    @returns           The concatenated argument lists of the first candidate
                       with at least one configured key, else default
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        return argdict if use_compat else default
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        configured = [argdict.get(key.lower()) for key in variadic(key_list)]
        matches = [args for args in configured if args is not None]
        if matches:
            return list(itertools.chain.from_iterable(matches))
    return default
3560
3561
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """
    Build the lookup keys for main_key/exe and fetch the matching arguments.

    The root key is the lower-cased exe if it equals the lower-cased main_key,
    else "<main_key>+<exe>". Every entry of keys is appended to the root key
    as a suffix. When the bare root key is among the resulting keys (which is
    the case when keys is empty), the (main_key, exe) group (if the two
    differ) and "default" are added as fallbacks; otherwise use_compat is
    switched off. The lookup itself is done by cli_configuration_args().
    """
    main_key = main_key.lower()
    exe = exe.lower()
    if main_key == exe:
        root_key = exe
    else:
        root_key = f'{main_key}+{exe}'

    suffixes = keys or ['']
    lookup_keys = [f'{root_key}{suffix}' for suffix in suffixes]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3573
3574
3575 class ISO639Utils:
3576     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3577     _lang_map = {
3578         'aa': 'aar',
3579         'ab': 'abk',
3580         'ae': 'ave',
3581         'af': 'afr',
3582         'ak': 'aka',
3583         'am': 'amh',
3584         'an': 'arg',
3585         'ar': 'ara',
3586         'as': 'asm',
3587         'av': 'ava',
3588         'ay': 'aym',
3589         'az': 'aze',
3590         'ba': 'bak',
3591         'be': 'bel',
3592         'bg': 'bul',
3593         'bh': 'bih',
3594         'bi': 'bis',
3595         'bm': 'bam',
3596         'bn': 'ben',
3597         'bo': 'bod',
3598         'br': 'bre',
3599         'bs': 'bos',
3600         'ca': 'cat',
3601         'ce': 'che',
3602         'ch': 'cha',
3603         'co': 'cos',
3604         'cr': 'cre',
3605         'cs': 'ces',
3606         'cu': 'chu',
3607         'cv': 'chv',
3608         'cy': 'cym',
3609         'da': 'dan',
3610         'de': 'deu',
3611         'dv': 'div',
3612         'dz': 'dzo',
3613         'ee': 'ewe',
3614         'el': 'ell',
3615         'en': 'eng',
3616         'eo': 'epo',
3617         'es': 'spa',
3618         'et': 'est',
3619         'eu': 'eus',
3620         'fa': 'fas',
3621         'ff': 'ful',
3622         'fi': 'fin',
3623         'fj': 'fij',
3624         'fo': 'fao',
3625         'fr': 'fra',
3626         'fy': 'fry',
3627         'ga': 'gle',
3628         'gd': 'gla',
3629         'gl': 'glg',
3630         'gn': 'grn',
3631         'gu': 'guj',
3632         'gv': 'glv',
3633         'ha': 'hau',
3634         'he': 'heb',
3635         'iw': 'heb',  # Replaced by he in 1989 revision
3636         'hi': 'hin',
3637         'ho': 'hmo',
3638         'hr': 'hrv',
3639         'ht': 'hat',
3640         'hu': 'hun',
3641         'hy': 'hye',
3642         'hz': 'her',
3643         'ia': 'ina',
3644         'id': 'ind',
3645         'in': 'ind',  # Replaced by id in 1989 revision
3646         'ie': 'ile',
3647         'ig': 'ibo',
3648         'ii': 'iii',
3649         'ik': 'ipk',
3650         'io': 'ido',
3651         'is': 'isl',
3652         'it': 'ita',
3653         'iu': 'iku',
3654         'ja': 'jpn',
3655         'jv': 'jav',
3656         'ka': 'kat',
3657         'kg': 'kon',
3658         'ki': 'kik',
3659         'kj': 'kua',
3660         'kk': 'kaz',
3661         'kl': 'kal',
3662         'km': 'khm',
3663         'kn': 'kan',
3664         'ko': 'kor',
3665         'kr': 'kau',
3666         'ks': 'kas',
3667         'ku': 'kur',
3668         'kv': 'kom',
3669         'kw': 'cor',
3670         'ky': 'kir',
3671         'la': 'lat',
3672         'lb': 'ltz',
3673         'lg': 'lug',
3674         'li': 'lim',
3675         'ln': 'lin',
3676         'lo': 'lao',
3677         'lt': 'lit',
3678         'lu': 'lub',
3679         'lv': 'lav',
3680         'mg': 'mlg',
3681         'mh': 'mah',
3682         'mi': 'mri',
3683         'mk': 'mkd',
3684         'ml': 'mal',
3685         'mn': 'mon',
3686         'mr': 'mar',
3687         'ms': 'msa',
3688         'mt': 'mlt',
3689         'my': 'mya',
3690         'na': 'nau',
3691         'nb': 'nob',
3692         'nd': 'nde',
3693         'ne': 'nep',
3694         'ng': 'ndo',
3695         'nl': 'nld',
3696         'nn': 'nno',
3697         'no': 'nor',
3698         'nr': 'nbl',
3699         'nv': 'nav',
3700         'ny': 'nya',
3701         'oc': 'oci',
3702         'oj': 'oji',
3703         'om': 'orm',
3704         'or': 'ori',
3705         'os': 'oss',
3706         'pa': 'pan',
3707         'pe': 'per',
3708         'pi': 'pli',
3709         'pl': 'pol',
3710         'ps': 'pus',
3711         'pt': 'por',
3712         'qu': 'que',
3713         'rm': 'roh',
3714         'rn': 'run',
3715         'ro': 'ron',
3716         'ru': 'rus',
3717         'rw': 'kin',
3718         'sa': 'san',
3719         'sc': 'srd',
3720         'sd': 'snd',
3721         'se': 'sme',
3722         'sg': 'sag',
3723         'si': 'sin',
3724         'sk': 'slk',
3725         'sl': 'slv',
3726         'sm': 'smo',
3727         'sn': 'sna',
3728         'so': 'som',
3729         'sq': 'sqi',
3730         'sr': 'srp',
3731         'ss': 'ssw',
3732         'st': 'sot',
3733         'su': 'sun',
3734         'sv': 'swe',
3735         'sw': 'swa',
3736         'ta': 'tam',
3737         'te': 'tel',
3738         'tg': 'tgk',
3739         'th': 'tha',
3740         'ti': 'tir',
3741         'tk': 'tuk',
3742         'tl': 'tgl',
3743         'tn': 'tsn',
3744         'to': 'ton',
3745         'tr': 'tur',
3746         'ts': 'tso',
3747         'tt': 'tat',
3748         'tw': 'twi',
3749         'ty': 'tah',
3750         'ug': 'uig',
3751         'uk': 'ukr',
3752         'ur': 'urd',
3753         'uz': 'uzb',
3754         've': 'ven',
3755         'vi': 'vie',
3756         'vo': 'vol',
3757         'wa': 'wln',
3758         'wo': 'wol',
3759         'xh': 'xho',
3760         'yi': 'yid',
3761         'ji': 'yid',  # Replaced by yi in 1989 revision
3762         'yo': 'yor',
3763         'za': 'zha',
3764         'zh': 'zho',
3765         'zu': 'zul',
3766     }
3767
3768     @classmethod
3769     def short2long(cls, code):
3770         """Convert language code from ISO 639-1 to ISO 639-2/T"""
3771         return cls._lang_map.get(code[:2])
3772
3773     @classmethod
3774     def long2short(cls, code):
3775         """Convert language code from ISO 639-2/T to ISO 639-1"""
3776         for short_name, long_name in cls._lang_map.items():
3777             if long_name == code:
3778                 return short_name
3779
3780
class ISO3166Utils:
    """Lookup of country names by their two-letter code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code to the corresponding full name

        The lookup is case-insensitive. Returns None for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4042
4043
class GeoUtils:
    """Generation of random IPv4 addresses from per-country address blocks"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address from a country's block or a given block

        @param code_or_block  Either a two-character country code (looked up
                              case-insensitively in the table above) or an
                              address block in "address/prefix length" notation
        @returns              The address in dotted notation, or None if the
                              country code has no known block
        """
        block = code_or_block
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        network, prefix_length = block.split('/')
        (lowest,) = struct.unpack('!L', socket.inet_aton(network))
        # Setting all bits to the right of the prefix gives the upper bound
        highest = lowest | (0xffffffff >> int(prefix_length))
        chosen = random.randint(lowest, highest)
        return str(socket.inet_ntoa(struct.pack('!L', chosen)))
4302
4303
4304 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4305 # released into Public Domain
4306 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4307
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    The result is the big-endian representation of n without leading zero
    bytes. Zero (and any negative value) yields a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # Shortest big-endian encoding, i.e. no leading zero bytes
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    # pad the front up to the next multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4336
4337
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Interpret a byte string as a big-endian unsigned integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Left-pad to a whole number of 32-bit words so the data can be read word by word
    shortfall = -len(s) % 4
    if shortfall:
        s = b'\000' * shortfall + s
    total = 0
    for (word,) in struct.iter_unpack('>I', s):
        total = (total << 32) | word
    return total
4353
4354
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The byte order is reversed before the data is read as one hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return format(ciphertext, 'x')
4370
4371
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data is longer than length - 11
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding string must consist of nonzero bytes only: the first zero
    # after the [0, 2] header marks where the data starts, so a zero inside
    # the padding would make the receiver cut the message at the wrong place
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4385
4386
4387 def _base_n_table(n, table):
4388     if not table and not n:
4389         raise ValueError('Either table or n must be specified')
4390     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4391
4392     if n and n != len(table):
4393         raise ValueError(f'base {n} exceeds table length {len(table)}')
4394     return table
4395
4396
def encode_base_n(num, n=None, table=None):
    """
    Convert given int to a base-n string

    @param num      non-negative integer to convert
    @param n        base; when omitted, the length of the table is used
    @param table    string of digit characters, lowest value first;
                    when omitted, 0-9a-zA-Z is used
    @returns        base-n representation of num, most significant digit first
    @raises ValueError  if num is negative, or if n and table do not fit together
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]
    if num < 0:
        # Floor division never brings a negative number to zero
        # (-1 // base == -1), so the loop below would not terminate
        raise ValueError('num must be non-negative')

    result, base = '', len(table)
    while num:
        num, digit = divmod(num, base)
        result = table[digit] + result
    return result
4408
4409
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    digits = _base_n_table(n, table)
    digit_values = {char: index for index, char in enumerate(digits)}
    base = len(digit_values)

    number = 0
    # Most significant digit comes first
    for digit in string:
        number *= base
        number += digit_values[digit]
    return number
4417
4418
def decode_packed_codes(code):
    """Decode code matched by PACKED_CODES_RE: every word of the obfuscated code is replaced with its symbol table entry"""
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    radix = int(base)
    remaining = int(count)
    words = symbols.split('|')

    # Each symbol is keyed by its index written in the given base;
    # an empty symbol means that the key stands for itself
    symbol_table = {}
    while remaining:
        remaining -= 1
        key = encode_base_n(remaining, radix)
        symbol_table[key] = words[remaining] or key

    def substitute(word_match):
        return symbol_table[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, obfuscated_code)
4435
4436
def caesar(s, alphabet, shift):
    """Shift every character of s that occurs in alphabet by shift positions, wrapping around; other characters are kept"""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4444
4445
def rot47(s):
    """Rotate the characters '!' to '~' of s by 47 places within that range"""
    printable = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable, 47)
4448
4449
def parse_m3u8_attributes(attrib):
    """Parse a list of KEY=value attributes (comma separated, values optionally double-quoted) into a dict; the quotes are dropped"""
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in re.findall(attribute_re, attrib)
    }
4457
4458
def urshift(val, n):
    """Shift val right by n bits, with negative values first offset by 2**32 (i.e. read as unsigned 32-bit numbers)"""
    if val < 0:
        val += 0x100000000
    return val >> n
4461
4462
def write_xattr(path, key, value):
    """
    Store value (bytes) as the extended attribute key of the file at path

    On Windows the value is written to the alternate data stream "path:key".
    Elsewhere a Python binding (os.setxattr, pyxattr >= 0.5.0 or xattr) is
    used if there is one, with the setfattr/xattr executables as fallback.

    Raises XAttrMetadataError if writing fails and XAttrUnavailableError
    if neither a binding nor an executable is found.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as stream:
                stream.write(value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    native_setxattr = getattr(os, 'setxattr', None)
    if callable(native_setxattr):
        setter = native_setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        setter = xattr.set if version_tuple(xattr.__version__) >= (0, 5, 0) else None
    elif xattr:
        setter = xattr.setxattr
    else:
        setter = None

    if setter:
        try:
            setter(path, key, value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    if check_executable('setfattr', ['--version']):
        exe = 'setfattr'
    elif check_executable('xattr', ['-h']):
        exe = 'xattr'
    else:
        tool_hint = (
            '"xattr" binary' if sys.platform != 'linux'
            else 'GNU "attr" package (which contains the "setfattr" tool)')
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
            + tool_hint)

    # The executables take the value as a command line argument, i.e. as text
    value = value.decode()
    if exe == 'xattr':
        cmd = [exe, '-w', key, value, path]
    else:
        cmd = [exe, '-n', key, '-v', value, path]
    try:
        _, stderr, returncode = Popen.run(
            cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as err:
        raise XAttrMetadataError(err.errno, err.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4514
4515
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the year, month and day (as strings) of a random date from 1950-01-01 to 1995-12-31"""
    earliest = dt.date(1950, 1, 1)
    latest = dt.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + dt.timedelta(days=random.randint(0, span_days))
    return dict(zip(
        (year_field, month_field, day_field),
        map(str, (birthday.year, birthday.month, birthday.day))))
4526
4527
def find_available_port(interface=''):
    """Bind a throwaway socket to port 0 on interface and return the port number it got, or None if that fails with an OSError"""
    with contextlib.suppress(OSError):
        with socket.socket() as sock:
            sock.bind((interface, 0))
            return sock.getsockname()[1]
    return None
4535
4536
# Templates for internet shortcut files, which are plain text files.
# The link target goes into the printf-style named placeholder `%(url)s`;
# the desktop entry additionally takes `%(filename)s` for its name.

# INI-style `[InternetShortcut]` file
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# XML property list (Apple PLIST 1.0 DTD) holding the URL
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# `[Desktop Entry]` file of type `Link`
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps the name of each link format to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4568
4569
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    An explicit port is kept, except for the redundant `:80` of an `http` URL.

    @param iri      the IRI, as a string
    @returns        the equivalent ASCII-only URI, as a string
    @raises ValueError  if the network location contains a bracket (IPv6 literal)
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
    # URLs without a network location (e.g. `file:///path`) have no hostname at all
    net_location += (iri_parts.hostname or '').encode('idna').decode()

    # Port 80 is the default only for `http`; dropping it for any other scheme
    # (e.g. `https://host:80/`) would make the URI point to a different port
    port = iri_parts.port
    is_default_port = iri_parts.scheme == 'http' and port == 80
    if port is not None and not is_default_port:
        net_location += ':' + str(port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4612
4613
def to_high_limit_path(path):
    """Return path unchanged, except on win32/cygwin where it is made absolute and given the extended-length path prefix"""
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    absolute = os.path.abspath(path)
    return '\\\\?\\' + absolute
4620
4621
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """
    Format the value found at `field` in `obj`

    The value is looked up with traverse_obj, passed through `func` and
    inserted into `template`. `default` is returned instead if the value
    is falsy or, when `ignore` is given, if it is one of the `ignore` values.
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val
    else:
        skip = val in variadic(ignore)
    if skip:
        return default
    return template % func(val)
4627
4628
def clean_podcast_url(url):
    """Remove the known tracking/analytics redirect prefixes from a podcast URL"""
    tracking_prefixes_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/'''
    without_prefixes = re.sub(tracking_prefixes_re, '', url)
    # Removing a prefix can leave two schemes back to back; only the second one is kept
    return re.sub(r'^\w+://(\w+://)', r'\1', without_prefixes)
4649
4650
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """
    Generate a random version 4 UUID

    @returns    lowercase string of the form xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx,
                where each x is a random hex digit and y is one of 8, 9, a, b
    """
    def _random_digit(mobj):
        # 'y' holds the variant bits (binary 10xx), which leaves only 8, 9, a and b
        if mobj.group(0) == 'y':
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4656
4657
def make_dir(path, to_screen=None):
    """
    Create the directory that is to contain `path`, including missing parents

    @param path         path of a file; only its directory part is created
    @param to_screen    optional callable that is given a message if creation fails
    @returns            True on success (or if path has no directory part),
                        False if the directory could not be created
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # callable() returns a bool, so comparing its result against None is
        # always true; test it directly so that a missing callback is skipped
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4668
4669
def get_executable_path():
    """Return the absolute directory of the path given as second item by the updater's _get_variant_and_executable_path()"""
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
4674
4675
def get_user_config_dirs(package_name):
    """Yield the per-user configuration directories of package_name: XDG config home, %APPDATA% (if set) and the home directory"""
    # .config (e.g. ~/.config/package_name)
    config_home = os.getenv('XDG_CONFIG_HOME')
    if not config_home:
        config_home = compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # home (~/.package_name)
    home = compat_expanduser('~')
    yield os.path.join(home, f'.{package_name}')
4688
4689
def get_system_config_dirs(package_name):
    """Yield the system-wide configuration directories of package_name"""
    # /etc/package_name
    system_dir = os.path.join('/etc', package_name)
    yield system_dir
4693
4694
def time_seconds(**kwargs):
    """
    Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the time difference that kwargs describe (same keywords as timedelta)
    """
    now = time.time()
    offset = dt.timedelta(**kwargs)
    return now + offset.total_seconds()
4700
4701
4702 # create a JSON Web Signature (jws) with HS256 algorithm
4703 # the resulting format is in JWS Compact Serialization
4704 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
4705 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """
    Build a token of the form header.payload.signature, signed with HMAC-SHA256

    @param payload_data  JSON-serializable payload
    @param key           signing key, as a string
    @param headers       extra header fields, added to/overriding 'alg' and 'typ'
    @returns             the token, as bytes

    Note that each part is encoded with standard base64 (`base64.b64encode`),
    padding included.
    """
    header_data = dict(alg='HS256', typ='JWT')
    if headers:
        header_data.update(headers)

    def b64_json(obj):
        return base64.b64encode(json.dumps(obj).encode())

    signing_input = b'.'.join((b64_json(header_data), b64_json(payload_data)))
    digest = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(digest)
4719
4720
4721 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload (middle part) of a three-part token; header and signature are not looked at"""
    _header, payload_b64, _signature = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded = payload_b64 + '==='
    return json.loads(base64.urlsafe_b64decode(padded))
4727
4728
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """
    Whether terminal sequences may be written to stream (result is cached per stream)

    Requires WINDOWS_VT_MODE to be enabled on Windows and the TERM environment
    variable to be set elsewhere; the stream itself must report being a TTY.
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        # Any failure to query the stream counts as "not supported"
        return False
4743
4744
def windows_enable_vt_mode():
    """
    Turn on the ENABLE_VIRTUAL_TERMINAL_PROCESSING mode of the Windows console

    Does nothing on Windows versions older than 10.0.10586. On success,
    WINDOWS_VT_MODE is set to True and the cache of
    supports_terminal_sequences() is cleared.
    Raises Exception if the console mode cannot be read or changed.

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Windows-only modules, hence imported here rather than at the top of the file
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device and get the OS handle behind the file descriptor
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Add the flag on top of whatever mode bits are currently set
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Results cached while the mode was still off are no longer valid
    supports_terminal_sequences.cache_clear()
4775
4776
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return string without the sequences that start with ESC [ and run up to the next 'm'"""
    cleaned, _ = _terminal_sequences_re.subn('', string)
    return cleaned
4782
4783
def number_of_digits(number):
    """Return the length of number formatted with %d (a minus sign counts as a character)"""
    as_decimal = '%d' % number
    return len(as_decimal)
4786
4787
def join_nonempty(*values, delim='-', from_dict=None):
    """
    Join the string forms of all truthy values with delim

    If from_dict is given, the values are instead taken as keys/paths that
    are first looked up in from_dict with traverse_obj.
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
4792
4793
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Scale every thumbnail up to the largest format:
    * the (width, height) pair that compares highest among the formats is
      determined, missing values counting as 0
    * in each thumbnail URL, the matches of url_width_re are replaced with that width
    * width and height are filled in from that pair where merge_dicts allows it

    The thumbnails are returned as they are if no format has a width.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    sizes = [tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats]
    largest = max(sizes, default=(0, 0))
    max_width = largest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': scaled_url}, dict(zip(dimension_keys, largest)), thumbnail))
    return scaled
4814
4815
def parse_http_range(range):
    """
    Parse value of "Range" or "Content-Range" HTTP header into tuple

    @returns (start, end, total); end and total are None if absent, and
             all three are None if the value is empty or not a bytes range
    """
    nothing = (None, None, None)
    if not range:
        return nothing
    found = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if found is None:
        return nothing
    start, end, total = found.groups()
    return int(start), int_or_none(end), int_or_none(total)
4824
4825
def read_stdin(what):
    """Return sys.stdin; if what is truthy, first write a prompt that names it and the EOF key combination"""
    if not what:
        return sys.stdin
    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
4831
4832
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)

    The encoding is None if neither a BOM nor a "# coding: ..." declaration
    at the very start of data is found.
    """

    # BOM marks are given priority over declarations
    bom_result = next(
        ((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_result is not None:
        return bom_result

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if declaration is None:
        return None, 0
    return declaration.group(1).decode(), 0
4849
4850
class Config:
    """
    A set of command line arguments together with the configs they pull in

    The arguments are parsed with the given parser; every location listed in
    the resulting `config_locations` is loaded as a nested Config (kept in
    `configs`). Files and stdin are loaded at most once across the whole tree.
    """
    # Arguments given directly to this config (not those of nested configs)
    own_args = None
    # Set to `own_args` once load_configs() has run them through the parser
    parsed_args = None
    # Path of the file the arguments were read from, if any
    filename = None
    # Set by load_configs(); init() asserts that it is still unset
    __initialized = False

    def __init__(self, parser, label=None):
        """Create an empty config; `label` is only used for display in __str__"""
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Set the arguments (and the file they came from) and load the configs they refer to; see load_configs() for the result"""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """
        Parse own_args and load every config location found in them

        @returns False if this config's file was already loaded (nothing is done then), else True
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            # '-' stands for stdin, which is read only once
            if location == '-':
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against the directory of this config's file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        """Describe this config and, indented with '| ', its nested configs; login info is hidden"""
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """
        Read the arguments stored in a file

        @param filename  path of the file
        @param default   returned as is if the file cannot be opened
        @returns         list of arguments, split shell-style with comments removed
        @raises ValueError  if the contents cannot be decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            # Detect the encoding from the first 512 bytes, then move to just after the BOM (if any)
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of the argument list in which the values of username/password options are replaced with 'PRIVATE'"""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        # Matches the `--option=value` form
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # The `--option value` form: the value is the argument after the option
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a nested config from `args` (passed on to init()) and keep it, unless init() reports it as already loaded"""
        config = type(self)(self.parser, label)
        # Shared, so that nothing is loaded twice anywhere in the tree
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """The arguments of the nested configs (last added first), followed by this config's own arguments"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        """Run the parser's parse_known_args() on all_args"""
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        """Run the parser's parse_args() on all_args"""
        return self.parser.parse_args(self.all_args)
4958
4959
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            # Title-casing the names makes differently-cased duplicates collapse into one key
            merged[name.title()] = value
    return merged
4963
4964
def cached_method(f):
    """
    Cache a method

    Results are stored per instance (in its `_cached_method__cache` attribute)
    and per method name, keyed by the arguments after defaults are applied.
    """
    method_signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Binding normalizes positional/keyword/default arguments into one form
        call = method_signature.bind(self, *args, **kwargs)
        call.apply_defaults()
        # `self` comes first and is not part of the key
        key = tuple(call.arguments.values())[1:]

        instance_cache = vars(self).setdefault('_cached_method__cache', {})
        results = instance_cache.setdefault(f.__name__, {})
        if key in results:
            return results[key]
        results[key] = f(self, *args, **kwargs)
        return results[key]
    return wrapper
4980
4981
class classproperty:
    """
    property access for class methods with optional caching

    Usable both as `@classproperty` and as `@classproperty(cache=True)`;
    with caching, the value is computed once for each class it is accessed through.
    """
    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Called with options only: return a decorator that still awaits the function
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls not in cache:
            cache[cls] = self.func(cls)
        return cache[cls]
5000
5001
class function_with_repr:
    """Callable wrapper around a function that gives it a custom repr(), or `module.qualname` if none is set"""
    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    @classmethod
    def set_repr(cls, repr_):
        """Return a decorator that wraps a function with the given repr"""
        return functools.partial(cls, repr_=repr_)

    def __repr__(self):
        custom = self.__repr
        if not custom:
            wrapped = self.func
            return f'{wrapped.__module__}.{wrapped.__qualname__}'
        return custom
5018
5019
class Namespace(types.SimpleNamespace):
    """SimpleNamespace that iterates over its attribute values and offers the (name, value) pairs as `items_`"""

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        return vars(self).items()
5029
5030
# File extensions (without the leading dot), grouped by the kind of file
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# `video` and `audio` also include the respective `common_*` extensions (appended at the end)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in that order
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5045
5046
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue

    Iterating yields the manager itself once per attempt, for at most
    `retries` + 1 attempts. An attempt that sets a (truthy) `error` has it
    passed to the error callback along with the attempt number and `retries`;
    an attempt that leaves `error` untouched ends the iteration.
    """
    attempt = 0
    _error = None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        # Extra keyword arguments are passed on to the callback on every call
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        if self._error is NO_DEFAULT:
            # The last attempt did not report any error
            return False
        return self.attempt <= self.retries

    @property
    def error(self):
        return None if self._error is NO_DEFAULT else self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # NO_DEFAULT marks "nothing reported during this attempt"
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            # Out of retries: report through `error` if given, else re-raise
            if not error:
                raise e
            if count > 1:
                return error(f'{e}. Giving up after {count - 1} retries')
            return error(str(e))

        if not count:
            return warn(e)
        if isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        if callable(sleep_func):
            delay = float_or_none(sleep_func(n=count - 1))
        else:
            delay = sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5101
5102
def make_archive_id(ie, video_id):
    """Return "<lower-cased extractor key> <video id>"; ie is either the key itself or an object providing ie_key()"""
    if not isinstance(ie, str):
        ie = ie.ie_key()
    return ' '.join((ie.lower(), f'{video_id}'))
5106
5107
def truncate_string(s, left, right=0):
    """
    Shorten s to left + right characters by replacing its middle with '...'

    The result consists of the first left - 3 characters, '...' and the last
    right characters. s is returned as is if it is None or short enough.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5113
5114
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """
    Apply a list of options, in order, to a selection of items

    Each option adds an item, or removes it when prefixed with '-'. Options
    that are keys of alias_dict are expanded (recursively) to the options
    they stand for; everything else must be one of alias_dict['all'], or
    with use_regex, a case-insensitive regex matched against those in full.

    @param start    items selected initially
    @returns        orderedSet of the resulting selection
    @raises ValueError  for an option that is neither an alias nor a known item
    """
    assert 'all' in alias_dict, '"all" alias is required'
    selection = list(start or [])
    for option in options:
        negated = option.startswith('-')
        name = option[1:] if negated else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if negated:
                # Negating an alias flips the sign of each of its members
                expansion = [
                    member[1:] if member.startswith('-') else f'-{member}'
                    for member in expansion]
            # NB: Do not allow regex in aliases for performance
            selection = orderedSet_from_options(expansion, alias_dict, start=selection)
            continue

        if use_regex:
            matches = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            matches = [name]
        else:
            raise ValueError(name)

        if not negated:
            selection.extend(matches)
            continue
        for item in matches:
            # Remove every occurrence, not just the first one
            while item in selection:
                selection.remove(item)

    return orderedSet(selection)
5143
5144
5145 # TODO: Rewrite
class FormatSorter:
    """
    Builds sort keys for format dicts from a list of sort-field specifications

    Each specification is matched against `regex` and has the shape
    "[+]FIELD", "[+]FIELD:LIMIT" or "[+]FIELD~LIMIT". The "+" is stored as the
    field's "reverse" flag and "~" as its "closest" flag (see evaluate_params).

    NOTE: `settings` is a class attribute and the methods below write into it via
    `self.settings` (computed defaults are cached, parsed limits are stored and
    unknown fields are registered). Unless an instance or subclass rebinds
    `settings`, those writes are therefore shared by all instances.
    """

    # Grammar of one sort item: optional "+", field name, then optionally ":" or "~" and a limit
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    # Sort items that evaluate_params() always appends after the user's and the extractor's
    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Not referenced inside this class; presumably a youtube-dl compatible order used by callers - TODO confirm
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration, read through _get_field_setting(). Keys used by this class:
    #   type          'field' (the default), 'ordered', 'boolean', 'extractor', 'multiple', 'combined' or 'alias'
    #   field         for plain fields the key read from the format dict; for 'multiple'/'combined'
    #                 a tuple of other sort fields; for 'alias' the sort field it stands for
    #   convert       how a limit (or, for 'ordered', a value) is converted: 'string', 'float_none',
    #                 'bytes', 'order' or 'ignore'; anything else is treated as
    #                 "float if the text is numeric, otherwise string"
    #   order         ranking list for 'ordered' fields; '' marks the rank of unlisted values
    #   order_free    alternative to `order`, used when the 'prefer_free_formats' param is set
    #   regex         whether the entries of `order` are regular expressions
    #   default       fallback passed to float_or_none() when converting the value
    #   max           'extractor' fields: values >= max are replaced by -1
    #   in_list, not_in_list   'boolean' fields: membership tests applied to the value
    #   function      'multiple' fields: reduces the component values to a single value
    #   visible       whether print_verbose_info() lists the field
    #   forced, priority   whether the field is placed ahead of user-given fields (see evaluate_params)
    #   deprecated    'alias' fields: report a deprecation when the alias is used
    #   same_limit    'combined' fields: do not split the limit text on ':'
    # evaluate_params() additionally stores 'reverse', 'closest', 'limit_text' and 'limit' here
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        # 'combined' fields expand to their component fields in evaluate_params();
        # 'multiple' fields reduce several component values to one via 'function'
        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        """
        @param ydl               Object providing `params` (a dict), `deprecated_feature()`
                                 and `write_debug()`
        @param field_preference  Iterable of sort items given by the extractor
        """
        self.ydl = ydl
        self._order = []  # Resolved sort fields in order of precedence; filled by evaluate_params()
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """
        Return the setting `key` of sort field `field`

        A setting that is not present is computed from the defaults below and
        stored back into `self.settings`, so later lookups return the same value.
        For a field that has no entry in `settings`, "forced" and "priority" are
        reported as False without registering it; any other key reports a
        deprecation through `ydl.deprecated_feature()` and registers the field
        with empty settings.
        """
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                # 'extractor' fields read the "preference" key; fields built from
                # other fields get a 1-tuple; everything else reads the key named like the field
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default  # Cache the computed default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """
        Convert `value` (a limit text or a raw field value) according to the field's "convert" setting

        @param field        Sort field name
        @param value        Value to convert, or None. Anything else must support .lower()
        @param convertNone  If false, None is returned as None without any conversion
        @returns            The converted value. For "order" conversion this is an integer rank
                            which is larger for entries that come earlier in the order list
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # Values not in the list share the rank of the '' entry, or get rank -1 if there is none
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    # '' and None entries are skipped; re.match only anchors at the start of value
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or  value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            # Any other conversion name (e.g. 'float', 'float_string')
            if value.isnumeric():
                return float(value)
            else:
                # Remember that this field holds strings; this changes the stored setting of the field
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """
        Build `self._order` and store each field's reverse/closest/limit in `self.settings`

        The sort items are processed in this order, and the first occurrence of a
        field wins: "forced" default fields, "priority" default fields (skipped if
        the 'format_sort_force' param is set), the user's 'format_sort' param,
        `sort_extractor`, and finally all of `self.default`.

        @param params          Dict of options; 'prefer_free_formats', 'format_sort'
                               and 'format_sort_force' are read
        @param sort_extractor  Iterable of sort items given by the extractor
        @raises ExtractorError If a sort item does not match `self.regex`
        """
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Registers a single field; a field that was already added is left untouched
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                # "closest" is meaningless without a usable limit
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
                else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                # NOTE(review): the message says "given by extractor", but sort_list also contains the user's items
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                # Empty item (the regex also accepts a blank string)
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A 'combined' field is replaced by its component fields; its limit text
            # may carry one ':'-separated limit per component
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                # Components beyond the given limits get the single shared limit, or none at all
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """
        Report the user's and the extractor's sort items and the resulting order of visible fields

        @param write_debug  Callable taking the message string
        """
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        # Each field is printed as "[+]field[<sep>limit_text(limit)]" where <sep> is '~' or ':'
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """
        Turn a raw value of `field` into a comparable sort-key tuple

        @param format  The format dict (not used by this method)
        @param field   Sort field name whose settings are applied
        @param type    The field's type: 'extractor', 'boolean' and 'ordered' get
                       special treatment, anything else is used as-is
        @param value   Raw value for the field
        @returns       (-10, 0) if there is no value, otherwise a 3-tuple; see the
                       comment above the return statement for the individual cases
        """
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            # 0 when the value passes both membership tests, -1 otherwise
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        # Cases, in the order they are tested:
        #   no value                                           -> (-10, 0)
        #   non-numeric value                                  -> (1, value, 0)
        #   "closest" ('~', only ever set together with a limit) -> keyed on the distance from the limit
        #   not reversed, and no limit or value <= limit       -> (0, value, 0)
        #   reversed without limit, reversed with value == limit,
        #   or value > limit                                   -> (0, -value, 0)
        #   remaining case (reversed and value < limit)        -> (-1, value, 0)
        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """
        Read the value of sort field `field` from the format dict and return its sort-key tuple

        For 'multiple' fields the values of all component fields are passed (as a
        generator) to the field's 'function' and the result is handled like a plain field.
        """
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        # Looks up the format-dict key configured as the 'field' setting of sort field f
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """
        Return the sort key of `format`: a tuple with one entry per field in `self._order`

        NOTE: `format` is modified in place. 'video_ext', 'audio_ext' and possibly
        'protocol', 'ext', 'preference', 'vbr', 'abr' and 'tbr' are written.

        @param format  The format dict
        @returns       Tuple of the per-field sort-key tuples
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        # NOTE(review): format['ext'] below is a plain subscript, so 'ext' must exist by now
        # unless both vcodec and acodec are 'none'
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        # Derive whichever of vbr/abr/tbr is missing from the other two. A zero result is stored as None.
        # presumably try_call() yields None when the lambda raises (missing key or None operand) - TODO confirm
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5440
5441
def filesize_from_tbr(tbr, duration):
    """
    Calculate the filesize implied by a bitrate and a duration

    @param tbr:      Total bitrate in kbps (1000 bits/sec)
    @param duration: Duration in seconds
    @returns         Filesize in bytes (truncated to an int),
                     or None if either argument is None
    """
    if tbr is not None and duration is not None:
        # 1 kbit = 1000 bits = 1000 / 8 bytes
        bytes_per_kbit = 1000 / 8
        return int(duration * tbr * bytes_per_kbit)
    return None
5451
5452
5453 # XXX: Temporary
class _YDLLogger:
    """
    Logger-like wrapper that forwards each message to the matching method of `ydl`

    Every method does nothing when no (or a falsy) `ydl` was given.
    """

    def __init__(self, ydl=None):
        self._ydl = ydl

    def debug(self, message):
        """Forward to ydl.write_debug()"""
        if not self._ydl:
            return
        self._ydl.write_debug(message)

    def info(self, message):
        """Forward to ydl.to_screen()"""
        if not self._ydl:
            return
        self._ydl.to_screen(message)

    def warning(self, message, *, once=False):
        """Forward to ydl.report_warning(); `once` is passed positionally"""
        if not self._ydl:
            return
        self._ydl.report_warning(message, once)

    def error(self, message, *, is_error=True):
        """Forward to ydl.report_error()"""
        if not self._ydl:
            return
        self._ydl.report_error(message, is_error=is_error)

    def stdout(self, message):
        """Forward to ydl.to_stdout()"""
        if not self._ydl:
            return
        self._ydl.to_stdout(message)

    def stderr(self, message):
        """Forward to ydl.to_stderr()"""
        if not self._ydl:
            return
        self._ydl.to_stderr(message)
5480             self._ydl.to_stderr(message)