yt_dlp/utils/_utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import collections.abc
   9 import contextlib
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import gzip
  15 import hashlib
  16 import hmac
  17 import html.entities
  18 import html.parser
  19 import http.client
  20 import http.cookiejar
  21 import inspect
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import mimetypes
  28 import netrc
  29 import operator
  30 import os
  31 import platform
  32 import random
  33 import re
  34 import shlex
  35 import socket
  36 import ssl
  37 import struct
  38 import subprocess
  39 import sys
  40 import tempfile
  41 import time
  42 import traceback
  43 import types
  44 import unicodedata
  45 import urllib.error
  46 import urllib.parse
  47 import urllib.request
  48 import xml.etree.ElementTree
  49 import zlib
  50
  51 from . import traversal
  52
  53 from ..compat import functools  # isort: split
  54 from ..compat import (
  55     compat_etree_fromstring,
  56     compat_expanduser,
  57     compat_HTMLParseError,
  58     compat_os_name,
  59     compat_shlex_quote,
  60 )
  61 from ..dependencies import brotli, certifi, websockets, xattr
  62 from ..socks import ProxyType, sockssocket
  63
# Drop the last dotted component of this module's name so that it reports
# itself under its parent package
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# The class of compiled pattern objects, obtained by inspecting what
# re.compile() returns. This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
  68
  69
  70 def random_user_agent():
  71     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  72     _CHROME_VERSIONS = (
  73         '90.0.4430.212',
  74         '90.0.4430.24',
  75         '90.0.4430.70',
  76         '90.0.4430.72',
  77         '90.0.4430.85',
  78         '90.0.4430.93',
  79         '91.0.4472.101',
  80         '91.0.4472.106',
  81         '91.0.4472.114',
  82         '91.0.4472.124',
  83         '91.0.4472.164',
  84         '91.0.4472.19',
  85         '91.0.4472.77',
  86         '92.0.4515.107',
  87         '92.0.4515.115',
  88         '92.0.4515.131',
  89         '92.0.4515.159',
  90         '92.0.4515.43',
  91         '93.0.4556.0',
  92         '93.0.4577.15',
  93         '93.0.4577.63',
  94         '93.0.4577.82',
  95         '94.0.4606.41',
  96         '94.0.4606.54',
  97         '94.0.4606.61',
  98         '94.0.4606.71',
  99         '94.0.4606.81',
 100         '94.0.4606.85',
 101         '95.0.4638.17',
 102         '95.0.4638.50',
 103         '95.0.4638.54',
 104         '95.0.4638.69',
 105         '95.0.4638.74',
 106         '96.0.4664.18',
 107         '96.0.4664.45',
 108         '96.0.4664.55',
 109         '96.0.4664.93',
 110         '97.0.4692.20',
 111     )
 112     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 113
 114
# Names of content encodings; 'br' is appended only when the optional
# brotli dependency (imported from ..dependencies) is available
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default HTTP header set. Note that the User-Agent is chosen once, when
# this module is executed, not per request
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


# Fixed User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
 132
 133
class NO_DEFAULT:
    """Sentinel meaning "no default was supplied".

    Used as a parameter default (compared with ``is``) so that ``None``
    can still be passed as a real default value.
    """
    pass
 136
 137
def IDENTITY(x):
    """Return the argument unchanged."""
    return x
 140
 141
# Full English month names, January first (index 0)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code; every list is ordered January..December
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
 157
# Timezone abbreviation -> offset from UTC in whole hours
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
 167
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The replacements are
# paired positionally with the characters of the first string; plain strings
# in the chain contribute one replacement per character, while the
# single-item lists contribute multi-letter replacements ('AE', 'OE', 'TH', ...)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 172
 173 DATE_FORMATS = (
 174     '%d %B %Y',
 175     '%d %b %Y',
 176     '%B %d %Y',
 177     '%B %dst %Y',
 178     '%B %dnd %Y',
 179     '%B %drd %Y',
 180     '%B %dth %Y',
 181     '%b %d %Y',
 182     '%b %dst %Y',
 183     '%b %dnd %Y',
 184     '%b %drd %Y',
 185     '%b %dth %Y',
 186     '%b %dst %Y %I:%M',
 187     '%b %dnd %Y %I:%M',
 188     '%b %drd %Y %I:%M',
 189     '%b %dth %Y %I:%M',
 190     '%Y %m %d',
 191     '%Y-%m-%d',
 192     '%Y.%m.%d.',
 193     '%Y/%m/%d',
 194     '%Y/%m/%d %H:%M',
 195     '%Y/%m/%d %H:%M:%S',
 196     '%Y%m%d%H%M',
 197     '%Y%m%d%H%M%S',
 198     '%Y%m%d',
 199     '%Y-%m-%d %H:%M',
 200     '%Y-%m-%d %H:%M:%S',
 201     '%Y-%m-%d %H:%M:%S.%f',
 202     '%Y-%m-%d %H:%M:%S:%f',
 203     '%d.%m.%Y %H:%M',
 204     '%d.%m.%Y %H.%M',
 205     '%Y-%m-%dT%H:%M:%SZ',
 206     '%Y-%m-%dT%H:%M:%S.%fZ',
 207     '%Y-%m-%dT%H:%M:%S.%f0Z',
 208     '%Y-%m-%dT%H:%M:%S',
 209     '%Y-%m-%dT%H:%M:%S.%f',
 210     '%Y-%m-%dT%H:%M',
 211     '%b %d %Y at %H:%M',
 212     '%b %d %Y at %H:%M:%S',
 213     '%B %d %Y at %H:%M',
 214     '%B %d %Y at %H:%M:%S',
 215     '%H:%M %d-%b-%Y',
 216 )
 217
 218 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 219 DATE_FORMATS_DAY_FIRST.extend([
 220     '%d-%m-%Y',
 221     '%d.%m.%Y',
 222     '%d.%m.%y',
 223     '%d/%m/%Y',
 224     '%d/%m/%y',
 225     '%d/%m/%Y %H:%M:%S',
 226     '%d-%m-%Y %H:%M',
 227     '%H:%M %d/%m/%Y',
 228 ])
 229
 230 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 231 DATE_FORMATS_MONTH_FIRST.extend([
 232     '%m-%d-%Y',
 233     '%m.%d.%Y',
 234     '%m/%d/%Y',
 235     '%m/%d/%y',
 236     '%m/%d/%Y %H:%M:%S',
 237 ])
 238
# Captures four groups: a quoted payload, two integers and a quoted,
# '|'-separated word list followed by .split('|')
# NOTE(review): presumably targets "packed" JavaScript - confirm with callers
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element (case-insensitive,
# dot matches newline) and captures its object/array body as `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# An unsigned integer with an optional decimal fraction
NUMBER_RE = r'\d+(?:\.\d+)?'
 243
 244
 245 @functools.cache
 246 def preferredencoding():
 247     """Get preferred encoding.
 248
 249     Returns the best encoding scheme for the system, based on
 250     locale.getpreferredencoding() and some further tweaks.
 251     """
 252     try:
 253         pref = locale.getpreferredencoding()
 254         'TEST'.encode(pref)
 255     except Exception:
 256         pref = 'UTF-8'
 257
 258     return pref
 259
 260
 261 def write_json_file(obj, fn):
 262     """ Encode obj as JSON and write it to fn, atomically if possible """
 263
 264     tf = tempfile.NamedTemporaryFile(
 265         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 266         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 267
 268     try:
 269         with tf:
 270             json.dump(obj, tf, ensure_ascii=False)
 271         if sys.platform == 'win32':
 272             # Need to remove existing file on Windows, else os.rename raises
 273             # WindowsError or FileExistsError.
 274             with contextlib.suppress(OSError):
 275                 os.unlink(fn)
 276         with contextlib.suppress(OSError):
 277             mask = os.umask(0)
 278             os.umask(mask)
 279             os.chmod(tf.name, 0o666 & ~mask)
 280         os.rename(tf.name, fn)
 281     except Exception:
 282         with contextlib.suppress(OSError):
 283             os.remove(tf.name)
 284         raise
 285
 286
 287 def find_xpath_attr(node, xpath, key, val=None):
 288     """ Find the xpath xpath[@key=val] """
 289     assert re.match(r'^[a-zA-Z_-]+$', key)
 290     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 291     return node.find(expr)
 292
 293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 294 # the namespace parameter
 295
 296
 297 def xpath_with_ns(path, ns_map):
 298     components = [c.split(':') for c in path.split('/')]
 299     replaced = []
 300     for c in components:
 301         if len(c) == 1:
 302             replaced.append(c[0])
 303         else:
 304             ns, tag = c
 305             replaced.append('{%s}%s' % (ns_map[ns], tag))
 306     return '/'.join(replaced)
 307
 308
 309 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 310     def _find_xpath(xpath):
 311         return node.find(xpath)
 312
 313     if isinstance(xpath, str):
 314         n = _find_xpath(xpath)
 315     else:
 316         for xp in xpath:
 317             n = _find_xpath(xp)
 318             if n is not None:
 319                 break
 320
 321     if n is None:
 322         if default is not NO_DEFAULT:
 323             return default
 324         elif fatal:
 325             name = xpath if name is None else name
 326             raise ExtractorError('Could not find XML element %s' % name)
 327         else:
 328             return None
 329     return n
 330
 331
 332 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 333     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 334     if n is None or n == default:
 335         return n
 336     if n.text is None:
 337         if default is not NO_DEFAULT:
 338             return default
 339         elif fatal:
 340             name = xpath if name is None else name
 341             raise ExtractorError('Could not find XML element\'s text %s' % name)
 342         else:
 343             return None
 344     return n.text
 345
 346
 347 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 348     n = find_xpath_attr(node, xpath, key)
 349     if n is None:
 350         if default is not NO_DEFAULT:
 351             return default
 352         elif fatal:
 353             name = f'{xpath}[@{key}]' if name is None else name
 354             raise ExtractorError('Could not find XML attribute %s' % name)
 355         else:
 356             return None
 357     return n.attrib[key]
 358
 359
 360 def get_element_by_id(id, html, **kwargs):
 361     """Return the content of the tag with the specified ID in the passed HTML document"""
 362     return get_element_by_attribute('id', id, html, **kwargs)
 363
 364
 365 def get_element_html_by_id(id, html, **kwargs):
 366     """Return the html of the tag with the specified ID in the passed HTML document"""
 367     return get_element_html_by_attribute('id', id, html, **kwargs)
 368
 369
 370 def get_element_by_class(class_name, html):
 371     """Return the content of the first tag with the specified class in the passed HTML document"""
 372     retval = get_elements_by_class(class_name, html)
 373     return retval[0] if retval else None
 374
 375
 376 def get_element_html_by_class(class_name, html):
 377     """Return the html of the first tag with the specified class in the passed HTML document"""
 378     retval = get_elements_html_by_class(class_name, html)
 379     return retval[0] if retval else None
 380
 381
 382 def get_element_by_attribute(attribute, value, html, **kwargs):
 383     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 384     return retval[0] if retval else None
 385
 386
 387 def get_element_html_by_attribute(attribute, value, html, **kargs):
 388     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 389     return retval[0] if retval else None
 390
 391
 392 def get_elements_by_class(class_name, html, **kargs):
 393     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 394     return get_elements_by_attribute(
 395         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 396         html, escape_value=False)
 397
 398
 399 def get_elements_html_by_class(class_name, html):
 400     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 401     return get_elements_html_by_attribute(
 402         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 403         html, escape_value=False)
 404
 405
 406 def get_elements_by_attribute(*args, **kwargs):
 407     """Return the content of the tag with the specified attribute in the passed HTML document"""
 408     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 409
 410
 411 def get_elements_html_by_attribute(*args, **kwargs):
 412     """Return the html of the tag with the specified attribute in the passed HTML document"""
 413     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 414
 415
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator yielding one `(content, whole)` tuple per matching
    element. Nothing is yielded if `value` is empty.

    @param attribute     Attribute name (matched literally)
    @param value         Attribute value; treated as a regex when escape_value is False
    @param html          HTML document to search
    @param tag           Regex the tag name must match
    @param escape_value  Whether to regex-escape `value`
    """
    if not value:
        return

    # '?' makes the quote around the attribute value optional; the quote is
    # mandatory when the value begins with a character that cannot start an
    # unquoted value. NOTE(review): re.match only inspects the first
    # character of `value` - confirm whether the whole value was intended
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches from the opening '<' up to and including the wanted attribute;
    # `(?-x:...)` switches verbose mode off so that whitespace in `value`
    # stays significant
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Re-parse from the start of the match to find where the element ends
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of matching quotes wrapping the content, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 441
 442
 443 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 444     """
 445     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 446     closing tag for the first opening tag it has encountered, and can be used
 447     as a context manager
 448     """
 449
 450     class HTMLBreakOnClosingTagException(Exception):
 451         pass
 452
 453     def __init__(self):
 454         self.tagstack = collections.deque()
 455         html.parser.HTMLParser.__init__(self)
 456
 457     def __enter__(self):
 458         return self
 459
 460     def __exit__(self, *_):
 461         self.close()
 462
 463     def close(self):
 464         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 465         # so data remains buffered; we no longer have any interest in it, thus
 466         # override this method to discard it
 467         pass
 468
 469     def handle_starttag(self, tag, _):
 470         self.tagstack.append(tag)
 471
 472     def handle_endtag(self, tag):
 473         if not self.tagstack:
 474             raise compat_HTMLParseError('no tags in the stack')
 475         while self.tagstack:
 476             inner_tag = self.tagstack.pop()
 477             if inner_tag == tag:
 478                 break
 479         else:
 480             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 481         if not self.tagstack:
 482             raise self.HTMLBreakOnClosingTagException()
 483
 484
 485 # XXX: This should be far less strict
 486 def get_element_text_and_html_by_tag(tag, html):
 487     """
 488     For the first element with the specified tag in the passed HTML document
 489     return its' content (text) and the whole element (html)
 490     """
 491     def find_or_raise(haystack, needle, exc):
 492         try:
 493             return haystack.index(needle)
 494         except ValueError:
 495             raise exc
 496     closing_tag = f'</{tag}>'
 497     whole_start = find_or_raise(
 498         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 499     content_start = find_or_raise(
 500         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 501     content_start += whole_start + 1
 502     with HTMLBreakOnClosingTagParser() as parser:
 503         parser.feed(html[whole_start:content_start])
 504         if not parser.tagstack or parser.tagstack[0] != tag:
 505             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 506         offset = content_start
 507         while offset < len(html):
 508             next_closing_tag_start = find_or_raise(
 509                 html[offset:], closing_tag,
 510                 compat_HTMLParseError(f'closing {tag} tag not found'))
 511             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 512             try:
 513                 parser.feed(html[offset:offset + next_closing_tag_end])
 514                 offset += next_closing_tag_end
 515             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 516                 return html[content_start:offset + next_closing_tag_start], \
 517                     html[whole_start:offset + next_closing_tag_end]
 518         raise compat_HTMLParseError('unexpected end of html')
 519
 520
 521 class HTMLAttributeParser(html.parser.HTMLParser):
 522     """Trivial HTML parser to gather the attributes for a single element"""
 523
 524     def __init__(self):
 525         self.attrs = {}
 526         html.parser.HTMLParser.__init__(self)
 527
 528     def handle_starttag(self, tag, attrs):
 529         self.attrs = dict(attrs)
 530         raise compat_HTMLParseError('done')
 531
 532
 533 class HTMLListAttrsParser(html.parser.HTMLParser):
 534     """HTML parser to gather the attributes for the elements of a list"""
 535
 536     def __init__(self):
 537         html.parser.HTMLParser.__init__(self)
 538         self.items = []
 539         self._level = 0
 540
 541     def handle_starttag(self, tag, attrs):
 542         if tag == 'li' and self._level == 0:
 543             self.items.append(dict(attrs))
 544         self._level += 1
 545
 546     def handle_endtag(self, tag):
 547         self._level -= 1
 548
 549
 550 def extract_attributes(html_element):
 551     """Given a string for an HTML element such as
 552     <el
 553          a="foo" B="bar" c="&98;az" d=boz
 554          empty= noval entity="&amp;"
 555          sq='"' dq="'"
 556     >
 557     Decode and return a dictionary of attributes.
 558     {
 559         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 560         'empty': '', 'noval': None, 'entity': '&',
 561         'sq': '"', 'dq': '\''
 562     }.
 563     """
 564     parser = HTMLAttributeParser()
 565     with contextlib.suppress(compat_HTMLParseError):
 566         parser.feed(html_element)
 567         parser.close()
 568     return parser.attrs
 569
 570
 571 def parse_list(webpage):
 572     """Given a string for an series of HTML <li> elements,
 573     return a dictionary of their attributes"""
 574     parser = HTMLListAttrsParser()
 575     parser.feed(webpage)
 576     parser.close()
 577     return parser.items
 578
 579
 580 def clean_html(html):
 581     """Clean an HTML snippet into a readable string"""
 582
 583     if html is None:  # Convenience for sanitizing descriptions etc.
 584         return html
 585
 586     html = re.sub(r'\s+', ' ', html)
 587     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 588     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 589     # Strip html tags
 590     html = re.sub('<.*?>', '', html)
 591     # Replace html entities
 592     html = unescapeHTML(html)
 593     return html.strip()
 594
 595
 596 class LenientJSONDecoder(json.JSONDecoder):
 597     # TODO: Write tests
 598     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 599         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 600         self._close_attempts = 2 * close_objects
 601         super().__init__(*args, **kwargs)
 602
 603     @staticmethod
 604     def _close_object(err):
 605         doc = err.doc[:err.pos]
 606         # We need to add comma first to get the correct error message
 607         if err.msg.startswith('Expecting \',\''):
 608             return doc + ','
 609         elif not doc.endswith(','):
 610             return
 611
 612         if err.msg.startswith('Expecting property name'):
 613             return doc[:-1] + '}'
 614         elif err.msg.startswith('Expecting value'):
 615             return doc[:-1] + ']'
 616
 617     def decode(self, s):
 618         if self.transform_source:
 619             s = self.transform_source(s)
 620         for attempt in range(self._close_attempts + 1):
 621             try:
 622                 if self.ignore_extra:
 623                     return self.raw_decode(s.lstrip())[0]
 624                 return super().decode(s)
 625             except json.JSONDecodeError as e:
 626                 if e.pos is None:
 627                     raise
 628                 elif attempt < self._close_attempts:
 629                     s = self._close_object(e)
 630                     if s is not None:
 631                         continue
 632                 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
 633         assert False, 'Too many attempts to decode JSON'
 634
 635
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' selects standard output (its binary buffer, if it has
    one) instead of opening a file.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # At most two attempts: the name as given, then the sanitized name
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                # Prefer a non-blocking locked open
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed or is unsupported: fall back to a plain open
                # NOTE(review): assumes LockingUnsupportedError derives from OSError - confirm
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt, and on permission errors, which
            # renaming is not going to fix
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Sanitizing changed nothing, so retrying would fail identically
            if old_filename == filename:
                raise
 672                 raise
 673
 674
 675 def timeconvert(timestr):
 676     """Convert RFC 2822 defined time string into system timestamp"""
 677     timestamp = None
 678     timetuple = email.utils.parsedate_tz(timestr)
 679     if timetuple is not None:
 680         timestamp = email.utils.mktime_tz(timetuple)
 681     return timestamp
 682
 683
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; '' for empty input, otherwise never empty
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Return the replacement for a single character. Replacements that
        # start with '\0' are "substitute" characters: the NUL marks them so
        # that they can be collapsed/stripped below before it is removed
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Question marks and control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the NUL markers; fall back to '_' if nothing is left
    result = result.replace('\0', '') or '_'

    # Extra cleanup unless the string is an ID (is_id unset counts as "not an ID"
    # here because the NO_DEFAULT class itself is truthy only for `not` == False;
    # NOTE(review): `not NO_DEFAULT` is False, so this branch is skipped when is_id is unset - confirm intent)
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 737
 738
 739 def sanitize_path(s, force=False):
 740     """Sanitizes and normalizes path on Windows"""
 741     if sys.platform == 'win32':
 742         force = False
 743         drive_or_unc, _ = os.path.splitdrive(s)
 744     elif force:
 745         drive_or_unc = ''
 746     else:
 747         return s
 748
 749     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 750     if drive_or_unc:
 751         norm_path.pop(0)
 752     sanitized_path = [
 753         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 754         for path_part in norm_path]
 755     if drive_or_unc:
 756         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 757     elif force and s and s[0] == os.path.sep:
 758         sanitized_path.insert(0, os.path.sep)
 759     return os.path.join(*sanitized_path)
 760
 761
 762 def sanitize_url(url, *, scheme='http'):
 763     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 764     # the number of unwanted failures due to missing protocol
 765     if url is None:
 766         return
 767     elif url.startswith('//'):
 768         return f'{scheme}:{url}'
 769     # Fix some common typos seen so far
 770     COMMON_TYPOS = (
 771         # https://github.com/ytdl-org/youtube-dl/issues/15649
 772         (r'^httpss://', r'https://'),
 773         # https://bx1.be/lives/direct-tv/
 774         (r'^rmtp([es]?)://', r'rtmp\1://'),
 775     )
 776     for mistake, fixup in COMMON_TYPOS:
 777         if re.match(mistake, url):
 778             return re.sub(mistake, fixup, url)
 779     return url
 780
 781
 782 def extract_basic_auth(url):
 783     parts = urllib.parse.urlsplit(url)
 784     if parts.username is None:
 785         return url, None
 786     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 787         parts.hostname if parts.port is None
 788         else '%s:%d' % (parts.hostname, parts.port))))
 789     auth_payload = base64.b64encode(
 790         ('%s:%s' % (parts.username, parts.password or '')).encode())
 791     return url, f'Basic {auth_payload.decode()}'
 792
 793
 794 def sanitized_Request(url, *args, **kwargs):
 795     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 796     if auth_header is not None:
 797         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 798         headers['Authorization'] = auth_header
 799     return urllib.request.Request(url, *args, **kwargs)
 800
 801
 802 def expand_path(s):
 803     """Expand shell variables and ~"""
 804     return os.path.expandvars(compat_expanduser(s))
 805
 806
 807 def orderedSet(iterable, *, lazy=False):
 808     """Remove all duplicates from the input iterable"""
 809     def _iter():
 810         seen = []  # Do not use set since the items can be unhashable
 811         for x in iterable:
 812             if x not in seen:
 813                 seen.append(x)
 814                 yield x
 815
 816     return _iter() if lazy else list(_iter())
 817
 818
 819 def _htmlentity_transform(entity_with_semicolon):
 820     """Transforms an HTML entity to a character."""
 821     entity = entity_with_semicolon[:-1]
 822
 823     # Known non-numeric HTML entity
 824     if entity in html.entities.name2codepoint:
 825         return chr(html.entities.name2codepoint[entity])
 826
 827     # TODO: HTML5 allows entities without a semicolon.
 828     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 829     if entity_with_semicolon in html.entities.html5:
 830         return html.entities.html5[entity_with_semicolon]
 831
 832     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 833     if mobj is not None:
 834         numstr = mobj.group(1)
 835         if numstr.startswith('x'):
 836             base = 16
 837             numstr = '0%s' % numstr
 838         else:
 839             base = 10
 840         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 841         with contextlib.suppress(ValueError):
 842             return chr(int(numstr, base))
 843
 844     # Unknown entity in name, return its literal representation
 845     return '&%s;' % entity
 846
 847
 848 def unescapeHTML(s):
 849     if s is None:
 850         return None
 851     assert isinstance(s, str)
 852
 853     return re.sub(
 854         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 855
 856
 857 def escapeHTML(text):
 858     return (
 859         text
 860         .replace('&', '&amp;')
 861         .replace('<', '&lt;')
 862         .replace('>', '&gt;')
 863         .replace('"', '&quot;')
 864         .replace("'", '&#39;')
 865     )
 866
 867
 868 class netrc_from_content(netrc.netrc):
 869     def __init__(self, content):
 870         self.hosts, self.macros = {}, {}
 871         with io.StringIO(content) as stream:
 872             self._parse('-', stream, False)
 873
 874
 875 class Popen(subprocess.Popen):
 876     if sys.platform == 'win32':
 877         _startupinfo = subprocess.STARTUPINFO()
 878         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 879     else:
 880         _startupinfo = None
 881
 882     @staticmethod
 883     def _fix_pyinstaller_ld_path(env):
 884         """Restore LD_LIBRARY_PATH when using PyInstaller
 885             Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
 886                  https://github.com/yt-dlp/yt-dlp/issues/4573
 887         """
 888         if not hasattr(sys, '_MEIPASS'):
 889             return
 890
 891         def _fix(key):
 892             orig = env.get(f'{key}_ORIG')
 893             if orig is None:
 894                 env.pop(key, None)
 895             else:
 896                 env[key] = orig
 897
 898         _fix('LD_LIBRARY_PATH')  # Linux
 899         _fix('DYLD_LIBRARY_PATH')  # macOS
 900
 901     def __init__(self, *args, env=None, text=False, **kwargs):
 902         if env is None:
 903             env = os.environ.copy()
 904         self._fix_pyinstaller_ld_path(env)
 905
 906         self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
 907         if text is True:
 908             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 909             kwargs.setdefault('encoding', 'utf-8')
 910             kwargs.setdefault('errors', 'replace')
 911         super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
 912
 913     def communicate_or_kill(self, *args, **kwargs):
 914         try:
 915             return self.communicate(*args, **kwargs)
 916         except BaseException:  # Including KeyboardInterrupt
 917             self.kill(timeout=None)
 918             raise
 919
 920     def kill(self, *, timeout=0):
 921         super().kill()
 922         if timeout != 0:
 923             self.wait(timeout=timeout)
 924
 925     @classmethod
 926     def run(cls, *args, timeout=None, **kwargs):
 927         with cls(*args, **kwargs) as proc:
 928             default = '' if proc.__text_mode else b''
 929             stdout, stderr = proc.communicate_or_kill(timeout=timeout)
 930             return stdout or default, stderr or default, proc.returncode
 931
 932
 933 def encodeArgument(s):
 934     # Legacy code that uses byte strings
 935     # Uncomment the following line after fixing all post processors
 936     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 937     return s if isinstance(s, str) else s.decode('ascii')
 938
 939
 940 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 941
 942
 943 def timetuple_from_msec(msec):
 944     secs, msec = divmod(msec, 1000)
 945     mins, secs = divmod(secs, 60)
 946     hrs, mins = divmod(mins, 60)
 947     return _timetuple(hrs, mins, secs, msec)
 948
 949
 950 def formatSeconds(secs, delim=':', msec=False):
 951     time = timetuple_from_msec(secs * 1000)
 952     if time.hours:
 953         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 954     elif time.minutes:
 955         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 956     else:
 957         ret = '%d' % time.seconds
 958     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 959
 960
 961 def _ssl_load_windows_store_certs(ssl_context, storename):
 962     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 963     try:
 964         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 965                  if encoding == 'x509_asn' and (
 966                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 967     except PermissionError:
 968         return
 969     for cert in certs:
 970         with contextlib.suppress(ssl.SSLError):
 971             ssl_context.load_verify_locations(cadata=cert)
 972
 973
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler with an SSL context configured from params.

    Keys read from params: 'nocheckcertificate', 'legacyserverconnect',
    'compat_opts', 'client_certificate', 'client_certificate_key' and
    'client_certificate_password'.
    Any extra keyword arguments are passed on to YoutubeDLHTTPSHandler.
    Raises YoutubeDLError if the client certificate cannot be loaded.
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # NOTE: check_hostname is set before verify_mode below; the ssl module
    # refuses verify_mode = CERT_NONE while hostname checking is still enabled
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer the certifi bundle when it is available and not opted out of
        if certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    # Optional client-side certificate (with optional separate key file and password)
    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1035
1036
def bug_reports_message(before=';'):
    """Return the "please report this issue" notice, prefixed by `before`.

    The notice is capitalized when it starts a new sentence, i.e. when `before`
    is empty or ends with '.', '!' or '?'.
    """
    from ..update import REPOSITORY

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    before = before.rstrip()
    starts_sentence = not before or before.endswith(('.', '!', '?'))
    if starts_sentence:
        msg = msg[0].title() + msg[1:]

    prefix = before + ' ' if before else ''
    return prefix + msg
1048
1049
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Subclasses may override this with a default message
    msg = None

    def __init__(self, msg=None):
        if msg is None:
            # Fall back to the class-level message, or to the class name if there is none
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
1060
1061
# Exception types that are treated as network errors
network_exceptions = (
    urllib.error.URLError,
    http.client.HTTPException,
    socket.error,
    *((ssl.CertificateError,) if hasattr(ssl, 'CertificateError') else ()),
)
1066
1067
class ExtractorError(YoutubeDLError):
    """Error during info extraction.

    The final message is assembled from the extractor name, the video id, the
    original message and the cause, and is rebuilt whenever one of the
    attributes of the exception is changed afterwards.
    """

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        cause, video_id and ie, if given, are included in the final message.
        """
        # Errors raised while handling a network exception are always considered expected.
        # Note that this only matches the exact exception types, not their subclasses
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When re-raised from another ExtractorError, keep the exc_info of the innermost one
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: '[ie] video_id: orig_msg (caused by ...)' followed by the
        # bug report notice unless the error is expected
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted tracebacks of `traceback` and `cause` joined by a newline, or None"""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once the message has been set (i.e. after __init__), keep msg and args
        # in sync with the attributes the message is built from
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
1109
1110
class UnsupportedError(ExtractorError):
    """Error for a URL that is not supported; the URL is kept in `url`."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1116
1117
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
1121
1122
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    The `countries` passed to the constructor are kept on the exception.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Geo restriction is never a bug, so the error is always marked as expected
        kwargs.update(expected=True)
        super().__init__(msg, **kwargs)
        self.countries = countries
1134
1135
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        # Always an expected error; a generic message is used if none is given
        kwargs.update(expected=True)
        message = msg or 'The channel is not currently live'
        super().__init__(message, **kwargs)
1142
1143
class DownloadError(YoutubeDLError):
    """Download Error exception.

    FileDownloader objects may throw this exception, carrying the appropriate
    error message, when they are not configured to continue on errors.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
1156
1157
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Message used when the exception is raised without an explicit one
    msg = 'Entry not found in info'
1165
1166
class SameFileError(YoutubeDLError):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        # Append the offending filename to the default message, if it is known
        if filename is not None:
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
1179
1180
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    It adds nothing on top of YoutubeDLError.
    """
1187
1188
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Message used when the exception is raised without an explicit one
    msg = 'The download was cancelled'
1192
1193
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Message used when the exception is raised without an explicit one
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1197
1198
class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    # Message used when the exception is raised without an explicit one
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1202
1203
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Message used when the exception is raised without an explicit one
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1207
1208
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        # `expected` is only stored here; it is up to the handler of the exception to use it
        self.expected = expected
        super().__init__(msg)
1215
1216
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always raised with the fixed class-level message and expected=False
        super().__init__(self.msg, expected=False)
1223
1224
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Thrown when a video is requested in a format
    that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        # Append the underlying error to the default message, if there is one
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1237
1238
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file is
    smaller than what the server announced first, which indicates that the
    connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both sizes are in bytes
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1252
1253
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an error code and message, classified into `reason`:
    'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'."""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        def mentions(*phrases):
            return any(phrase in msg for phrase in phrases)

        # Classify by the errno code first, then by well-known phrases in the message
        if code in (errno.ENOSPC, errno.EDQUOT) or mentions('No space left', 'Disk quota exceeded'):
            self.reason = 'NO_SPACE'
        elif code == errno.E2BIG or mentions('Argument list too long'):
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1268
1269
class XAttrUnavailableError(YoutubeDLError):
    """YoutubeDLError subclass that adds no behavior of its own."""
1272
1273
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class(*args, **kwargs), binding it to the handler's
    'source_address' parameter if one is set."""
    conn = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return conn

    # This is to workaround _create_connection() from socket where it will try all
    # address data from getaddrinfo() including IPv6. This filters the result from
    # getaddrinfo() based on the source_address value.
    # This is based on the cpython socket.create_connection() function.
    # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
    def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
        host, port = address
        last_error = None
        candidates = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
        # Only remote addresses of the same family as the source address are usable
        family = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
        usable = [info for info in candidates if info[0] == family]
        if candidates and not usable:
            ip_version = 'v4' if family == socket.AF_INET else 'v6'
            raise OSError(
                "No remote IP%s addresses available for connect, can't use '%s' as source address"
                % (ip_version, source_address[0]))

        for af, socktype, proto, _canonname, sockaddr in usable:
            sock = None
            try:
                sock = socket.socket(af, socktype, proto)
                if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                    sock.settimeout(timeout)
                sock.bind(source_address)
                sock.connect(sockaddr)
                last_error = None  # Explicitly break reference cycle
                return sock
            except OSError as exc:
                last_error = exc
                if sock is not None:
                    sock.close()

        if last_error is not None:
            raise last_error
        raise OSError('getaddrinfo returns an empty list')

    if hasattr(conn, '_create_connection'):
        conn._create_connection = _create_connection
    conn.source_address = (source_address, 0)
    return conn
1319
1320
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class automatically adds the
    standard headers to every HTTP request and decodes gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        connection_class = http.client.HTTPConnection

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, whether it is a raw stream or zlib-wrapped."""
        if not data:
            return data
        try:
            # Raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli data."""
        return brotli.decompress(data) if data else data

    @staticmethod
    def gz(data):
        """Decompress gzip data, tolerating up to 1023 bytes of trailing junk."""
        def gunzip(payload):
            return gzip.GzipFile(fileobj=io.BytesIO(payload), mode='rb').read()

        try:
            return gunzip(data)
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for trim in range(1, 1024):
                try:
                    return gunzip(data[:-trim])
                except OSError:
                    continue
            raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        default_headers = self._params.get('http_headers', std_headers)
        for name, value in default_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if name.capitalize() not in req.headers:
                req.add_header(name, value)

        if 'Youtubedl-no-compression' in req.headers:  # deprecated
            req.headers.pop('Youtubedl-no-compression', None)
            req.add_header('Accept-encoding', 'identity')

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        applied_encodings = resp.headers.get('Content-encoding', '').split(',')
        for encoding in map(str.strip, reversed(applied_encodings)):
            if encoding == 'gzip':
                decoder = self.gz
            elif encoding == 'deflate':
                decoder = self.deflate
            elif encoding == 'br' and brotli:
                decoder = self.brotli
            else:
                continue
            decoded_response = decoder(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(
                io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
1448
1449
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of base_class that connects through a SOCKS proxy.

    base_class must be http.client.HTTPConnection, http.client.HTTPSConnection
    or a subclass of these.
    socks_proxy is the proxy URL; the supported schemes are 'socks', 'socks4',
    'socks4a' and 'socks5', and the port defaults to 1080.
    Raises ValueError if the scheme of the proxy URL is not supported.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear error instead of leaving socks_type unbound
        raise ValueError(f'Unknown SOCKS proxy version: {scheme}')

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, TLS is negotiated on top of the established proxy connection
            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1491
1492
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler that supports a custom connection class, the
    'Ytdl-socks-proxy' request header and the handler's 'source_address' parameter."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        conn_class = self._https_conn_class

        # Pass on whichever of these the base handler has set ('_context': python > 2.6,
        # '_check_hostname': python 3.x)
        open_kwargs = {}
        for option, attribute in (('context', '_context'), ('check_hostname', '_check_hostname')):
            if hasattr(self, attribute):
                open_kwargs[option] = getattr(self, attribute)

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            conn_class = make_socks_conn_class(conn_class, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, conn_class, True)
        try:
            return self.do_open(connection_factory, req, **open_kwargs)
        except urllib.error.URLError as e:
            reason = e.reason
            handshake_failed = (
                isinstance(reason, ssl.SSLError)
                and getattr(reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE')
            if handshake_failed:
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
1521
1522
def is_path_like(f):
    """Whether f is a str, bytes or os.PathLike object."""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1525
1526
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """HTTPCookieProcessor that defers to the standard implementation for
    both HTTP and HTTPS."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1536
1537
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7231
     and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the request that follows a redirect of req to newurl.

        The method and payload are kept unless the redirect turns the request
        into a GET, in which case the payload and its Content-Length and
        Content-Type headers are dropped.
        Raises urllib.error.HTTPError if code is not a supported redirect code.
        """
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_method = req.get_method()
        new_data = req.data
        remove_headers = []
        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and req.get_method() != 'HEAD':
            new_method = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        elif code in (301, 302) and req.get_method() == 'POST':
            new_method = 'GET'

        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        # urllib stores header names capitalized (e.g. 'Content-type'), so they are
        # title-cased before being compared against the names to remove
        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
1581
1582
def extract_timezone(date_str):
    """Split the timezone designator off the end of date_str.

    Returns a tuple (timezone, date_str) where timezone is the UTC offset as a
    datetime.timedelta (zero if no timezone was recognized) and date_str is the
    input without the recognized timezone designator.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)

    if m:
        # 'Z' or a numeric +hh[:]mm / -hh[:]mm offset
        date_str = date_str[:-len(m.group('tz'))]
        sign_symbol = m.group('sign')
        if not sign_symbol:
            return datetime.timedelta(), date_str
        direction = 1 if sign_symbol == '+' else -1
        offset = datetime.timedelta(
            hours=direction * int(m.group('hours')),
            minutes=direction * int(m.group('minutes')))
        return offset, date_str

    # Otherwise look for a timezone abbreviation following the time
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    offset_hours = TIMEZONE_NAMES.get(m and m.group('tz').strip())
    if offset_hours is not None:
        date_str = date_str[:-len(m.group('tz'))]
    return datetime.timedelta(hours=offset_hours or 0), date_str
1611
1612
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """
    Return a UNIX timestamp from the given date

    @param delimiter    Separator expected between the date and time parts
    @param timezone     UTC offset (datetime.timedelta) to subtract; when None
                        it is extracted from date_str itself
    @returns            The timestamp, or None if date_str is None or does
                        not match "%Y-%m-%d<delimiter>%H:%M:%S"
    """
    if date_str is None:
        return None

    # Fractional seconds are not part of the strptime format used below
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        parsed = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((parsed - timezone).timetuple())
    except ValueError:
        return None
1628
1629
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST if day_first is truthy, else DATE_FORMATS_MONTH_FIRST"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1632
1633
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Commas become spaces; an AM/PM marker (and a word right after it) is dropped
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    _, date_str = extract_timezone(date_str)

    upload_date = None
    for fmt in date_formats(day_first):
        try:
            # No early exit: when several formats parse, the last one wins
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the email-header date parser
        parsed = email.utils.parsedate_tz(date_str)
        if parsed:
            try:
                upload_date = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return str(upload_date)
1656
1657
def unified_timestamp(date_str, day_first=True):
    """
    Return a UNIX timestamp parsed from a free-form date string

    @param day_first    Passed to date_formats() to select the format table
    @returns            The timestamp, or None if date_str is not a string
                        or could not be parsed
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas, pipes and weekday names, then collapse whitespace
    # NOTE(review): the weekday pattern has no "sun(day)" - confirm whether intentional
    without_weekday = re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)
    date_str = re.sub(r'\s+', ' ', without_weekday)

    # "PM" anywhere in the string shifts the result by 12 hours
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    unknown_tz = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if unknown_tz:
        date_str = date_str[:-len(unknown_tz.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    nanos = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if nanos:
        date_str = nanos.group(1)

    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(date_str, fmt) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    # Last resort: the email-header date parser
    parsed = email.utils.parsedate_tz(date_str)
    if not parsed:
        return None
    return calendar.timegm(parsed) + pm_delta * 3600 - timezone.total_seconds()
1689
1690
def determine_ext(url, default_ext='unknown_video'):
    """
    Guess the file extension from a URL

    The text after the last "." of the part before "?" is used when it is
    purely alphanumeric, or when (ignoring trailing slashes) it is listed in
    KNOWN_EXTENSIONS; otherwise default_ext is returned.
    """
    if url is None or '.' not in url:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1702
1703
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return filename with its extension replaced by "<sub_lang>.<sub_format>" (see replace_extension)"""
    sub_ext = '.'.join((sub_lang, sub_format))
    return replace_extension(filename, sub_ext, expected_real_ext)
1706
1707
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        # Keep full precision while computing; rounding happens at the end
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # Plain DATE without an offset
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the last offset may itself carry offsets, so recurse
    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount

    unit = match.group('unit')
    if unit in ('month', 'year'):
        # timedelta has no month/year units; shift calendar months instead
        new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(new_date, unit) if auto_precision else new_date
1748
1749
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    @raises ValueError  if strict is set and date_str does not fit those patterns
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1760
1761
def datetime_add_months(dt, months):
    """
    Increment/Decrement a datetime object by months.

    The day of month is clamped to the last day of the target month.
    """
    # Work with a zero-based month index so divmod yields the year carry
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    year = dt.year + year_shift
    month = month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1769
1770
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    @param precision    microsecond|second|minute|hour|day
                        "microsecond" returns dt unchanged
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {'day': 86400, 'hour': 3600, 'minute': 60, 'second': 1}
    timestamp = calendar.timegm(dt.timetuple())
    step = seconds_per_unit[precision]
    # Adding half a step before flooring rounds to the nearest multiple
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1787
1788
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1797
1798
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """
        start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest/latest representable date.
        @raises ValueError  if start lies after end
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start, strict=True))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end, strict=True))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if isinstance(date, datetime.date):
            candidate = date
        else:
            # Anything else is parsed (non-strict) by date_from_str
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __repr__(self):
        cls_name = type(self).__name__
        return f'{__name__}.{cls_name}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
1832
1833
@functools.cache
def system_identifier():
    """
    Return a one-line description of the running interpreter and platform

    Covers the Python version and implementation, machine, architecture,
    platform string, OpenSSL version and (when obtainable) the libc version.
    """
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_ver = []

    fields = (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
    return 'Python %s (%s %s %s) - %s (%s%s)' % fields
1852
1853
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    # Element 1 of win32_ver() is the version string
    return version_tuple(platform.win32_ver()[1])
1861
1862
def write_string(s, out=None, encoding=None):
    """
    Write the string s to out (default: sys.stderr) and flush it

    @param encoding     Encoding to use when bytes have to be written;
                        defaults to the stream's own or the preferred encoding
    """
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(out, 'mode', ''):
        # Stream opened in binary mode: encode and write to it directly
        target, enc = out, encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream: bypass it and write encoded bytes to the underlying buffer
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
    else:
        target, enc = out, None

    payload = s.encode(enc, 'ignore') if enc else s
    target.write(payload)
    out.flush()
1882
1883
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """
    Report a deprecation

    When running as CLI, each distinct msg is emitted only once, through
    printer if given or write_string otherwise, with bug_reports_message()
    appended. Otherwise a DeprecationWarning is issued via warnings.warn.

    @param stacklevel   Extra stack levels to skip for warnings.warn
    @param kwargs       Forwarded to printer/write_string
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    seen = deprecation_warning._cache
    if msg in seen:
        return
    seen.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Messages already reported in CLI mode
deprecation_warning._cache = set()
1899
1900
def bytes_to_intlist(bs):
    """Return the items of bs as a list of ints; items that are not ints are converted with ord()"""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
1908
1909
def intlist_to_bytes(xs):
    """Pack a sequence of ints (each packed as an unsigned byte) into a bytes object"""
    if not xs:
        return b''
    layout = '%dB' % len(xs)
    return struct.pack(layout, *xs)
1914
1915
class LockingUnsupportedError(OSError):
    """Raised by the fallback _lock_file/_unlock_file when no locking backend could be imported"""
    # Fixed message passed to OSError
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1921
1922
# Cross-platform file locking
#
# Defines `_lock_file(f, exclusive, block)` and `_unlock_file(f)`:
#   - Windows: LockFileEx/UnlockFileEx from kernel32 through ctypes
#   - elsewhere: fcntl.flock, falling back to fcntl.lockf
#   - if fcntl cannot be imported: both raise LockingUnsupportedError
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count to lock, i.e. "the whole file"
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the whole file; raises BlockingIOError if LockFileEx fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object since unlocking needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release a lock taken by _lock_file; raises OSError if UnlockFileEx fails"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Take a shared or exclusive lock on f, optionally without blocking"""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # The lock is held by someone else; do not retry with lockf
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock on f, trying each unlock variant in turn"""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
2009
2010
class locked_file:
    """
    File wrapper that holds a lock (via _lock_file/_unlock_file) while open

    Used as a context manager, or through the open()/close() aliases.
    Modes containing 'r' request a non-exclusive lock, all others an
    exclusive one. Attribute access and iteration are forwarded to the
    wrapped file object.
    """
    # Whether this instance currently holds the lock
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """
        @param mode     One of r, rb, a, ab, w, wb
        @param block    Passed to _lock_file as its block argument
        @raises NotImplementedError  for any other mode
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        # The file is opened through os.open so that it is NOT truncated
        # before the lock is held; truncation happens in __enter__
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        """Acquire the lock (closing the file if that fails), then truncate for 'w' modes"""
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        """Release the lock if held; `locked` is reset even when unlocking raises"""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        """Unlock and always close the underlying file"""
        try:
            self.unlock()
        finally:
            self.f.close()

    # Allow explicit use without a `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Only called for attributes not found on this object: delegate to the file
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2074
2075
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None"""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2080
2081
def shell_quote(args):
    """
    Return the arguments as a single space-separated string, each one
    quoted with compat_shlex_quote

    Bytes arguments are decoded with the filesystem encoding first.
    """
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
2091
2092
def smuggle_url(url, data):
    """
    Pass additional data in a URL for internal use.

    data is JSON-encoded into a "#__youtubedl_smuggle=..." fragment. If url
    already carries smuggled data, both are merged and the values already
    present in the URL take precedence.

    @param data     Mapping of JSON-serializable values; it is not modified
    @returns        The URL with the smuggled fragment appended
    """
    url, idata = unsmuggle_url(url, {})
    # Merge into a fresh dict so that the caller's mapping is left untouched
    merged = {**data, **idata}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    """
    Split a URL produced by smuggle_url back into (url, data)

    @param default  Returned as the data part if nothing was smuggled
    @returns        (url, data); url is unchanged when nothing was smuggled
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The payload is URL-encoded, so the last "#" is the one added by smuggle_url
    url, _, sdata = smug_url.rpartition('#')
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
2110
2111
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """
    Formats numbers with decimal suffixes like K, M, etc

    @param num      Number to format (anything float_or_none accepts)
    @param fmt      %-style format applied to (scaled number, suffix)
    @param factor   Base of the scale; with 1024 the suffixes become Ki, Mi, ...
    @returns        The formatted string, or None if num is not a number
                    or is negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to [0, len(POSSIBLE_SUFFIXES)]: values below 1 have a negative
        # logarithm, which would otherwise index the suffix list from its end
        exponent = min(max(int(math.log(num, factor)), 0), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2124
2125
def format_bytes(bytes):
    """Format a byte count with two decimals and 1024-based suffixes; 'N/A' if it cannot be formatted"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2128
2129
def lookup_unit_table(unit_table, s, strict=False):
    """
    Parse "<number><unit>" from s and scale the number by unit_table[unit]

    @param unit_table   Mapping of unit string to multiplier
    @param strict       If set, the whole string must match and the number
                        must match NUMBER_RE as is; otherwise only the start
                        must match and "," is also accepted where NUMBER_RE
                        has a literal "."
    @returns            The rounded product, or None if s does not match
    """
    if strict:
        matcher, num_re = re.fullmatch, NUMBER_RE
    else:
        matcher, num_re = re.match, NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(map(re.escape, unit_table))

    found = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not found:
        return None

    number = float(found.group('num').replace(',', '.'))
    return round(number * unit_table[found.group('unit')])
2141
2142
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    # '' (no suffix) is 1024**0, 'K' is 1024**1, and so on
    units = ('', *'KMGTPEZY')
    table = {unit: 1024 ** power for power, unit in enumerate(units)}
    return lookup_unit_table(table, s.upper(), strict=True)
2148
2149
def parse_filesize(s):
    """
    Parse a human-readable file size such as "5 MiB" or "1,5GB" into bytes

    Returns None if s is None or does not match (see lookup_unit_table).
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # NOTE(review): mixed-case forms with a lower-case prefix ('kB', 'mB', ...)
    # map to the 1024-based multiple here - confirm that this is intended
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    # Non-strict lookup: only the start of s has to match
    return lookup_unit_table(_UNIT_TABLE, s)
2219
2220
def parse_count(s):
    """
    Parse a count such as "1,234", "1.5k" or "2M views" into an integer

    Returns None if s is None or no number could be extracted.
    """
    if s is None:
        return None

    # Drop a leading non-numeric label that is followed by whitespace
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    scaled = lookup_unit_table(multipliers, s)
    if scaled is not None:
        return scaled

    # Plain number followed by whitespace and other text
    plain = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(plain.group(1)) if plain else None
2248
2249
def parse_resolution(s, *, lenient=False):
    """
    Extract a resolution from a string

    Tried in order: "<width>x<height>", "<height>p"/"<height>i", "4k"/"8k".

    @param lenient  If set, "<width>x<height>" may be adjacent to other
                    letters or digits
    @returns        dict with 'width' and/or 'height'; empty if nothing matched
    """
    if s is None:
        return {}

    dimensions_re = (
        r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
        else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    dimensions = re.search(dimensions_re, s)
    if dimensions:
        width, height = dimensions.group('w', 'h')
        return {
            'width': int(width),
            'height': int(height),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(kilo.group(1)) * 540}

    return {}
2273
2274
def parse_bitrate(s):
    """Return the integer in front of "kbps" in s, or None if s is not a string or has no such number"""
    if not isinstance(s, str):
        return
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2281
2282
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name

    Names are taken from MONTH_NAMES[lang], falling back to the 'en' list.
    Returns None if the name is not in the list. """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    for number, month_name in enumerate(month_names, start=1):
        if month_name == name:
            return number
    return None
2292
2293
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations

        The abbreviation is the first three characters of the names in
        ENGLISH_MONTH_NAMES. Returns None if there is no such month. """

    for number, month_name in enumerate(ENGLISH_MONTH_NAMES, start=1):
        if month_name[:3] == abbrev:
            return number
    return None
2302
2303
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the entities amp, lt, gt, apos, quot
    or a numeric character reference of up to 4 digits are left alone"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
2310
2311
def setproctitle(title):
    """
    Set the process name through prctl() of "libc.so.6" (best effort)

    Does nothing if ctypes or the library is unavailable, or if the library
    has no prctl.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initializing the buffer from the bytes makes it one byte larger than
    # the title, so it is always NUL-terminated as prctl expects
    buf = ctypes.create_string_buffer(title.encode())
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 = PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
2337
2338
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it is None or has no such prefix"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2341
2342
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged if it is None or has no such suffix"""
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. empty
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
2345
2346
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s, if present"""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
2354
2355
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"

    Returns the network location of url without a leading "www.",
    or None if that is empty.
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
2362
2363
def url_basename(url):
    """Return the last component of the path of url, ignoring leading and trailing slashes"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
2367
2368
def base_url(url):
    """Return the http(s) URL up to and including the last "/" before any query or fragment"""
    found = re.match(r'https?://[^?#]+/', url)
    return found.group()
2371
2372
def urljoin(base, path):
    """
    Lenient version of urllib.parse.urljoin

    bytes arguments are decoded. A path that already is an absolute or
    protocol-relative URL is returned as is.

    @returns    The joined URL, or None if path is empty or not a string,
                or if base is not an http(s) or protocol-relative URL
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not (isinstance(path, str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode()
    if isinstance(base, str) and re.match(r'^(?:https?:)?//', base):
        return urllib.parse.urljoin(base, path)
    return None
2386
2387
class HEADRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always HEAD"""

    def get_method(self):
        return 'HEAD'
2391
2392
class PUTRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always PUT"""

    def get_method(self):
        return 'PUT'
2396
2397
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """
    Convert v to an int, returning default if that is not possible

    @param get_attr     If set, this attribute of v is converted instead
    @param scale        The result is floor-divided by this
    @param invscale     The result is multiplied by this (before scale)
    """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    try:
        scaled = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
    return scaled
2405
2406
def str_or_none(v, default=None):
    """Return str(v), or default if v is None"""
    if v is None:
        return default
    return str(v)
2409
2410
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    ints are returned as is; in strings ",", "." and "+" are removed
    before conversion; anything else yields None """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
2418
2419
def float_or_none(v, scale=1, invscale=1, default=None):
    """
    Convert v to a float, returning default if v is None or not convertible

    @param scale        The result is divided by this
    @param invscale     The result is multiplied by this (before scale)
    """
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
2427
2428
def bool_or_none(v, default=None):
    """Return v if it is a bool, else default"""
    if isinstance(v, bool):
        return v
    return default
2431
2432
def strip_or_none(v, default=None):
    """Return v.strip() if v is a string, else default"""
    if isinstance(v, str):
        return v.strip()
    return default
2435
2436
def url_or_none(url):
    """
    Return the stripped url if it looks like a URL, else None

    Accepted are protocol-relative URLs ("//...") and the schemes allowed
    by the pattern below (http(s), rtmp/rtsp variants, mms, ftp(s)).
    """
    if not (url and isinstance(url, str)):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate) is None:
        return None
    return candidate
2442
2443
def request_to_url(req):
    """Return the full URL of a urllib Request; any other value is returned unchanged"""
    if not isinstance(req, urllib.request.Request):
        return req
    return req.get_full_url()
2449
2450
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """
    Format a timestamp or date string with strftime

    @param timestamp    UNIX timestamp (int/float) or a "YYYYMMDD" string
    @param date_format  strftime format; "%s" is replaced by the UNIX
                        timestamp on every platform
    @returns            The formatted string, or default if timestamp is of
                        another type, unparsable or out of range
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                               + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # For any other type datetime_object stays None; the resulting
        # AttributeError below is what yields `default`
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError):
        # OverflowError: timestamps too large for timedelta/datetime (or infinite)
        return default
2468
2469
def parse_duration(s):
    """
    Parse a duration string into a number of seconds

    Three notations are tried in order:
      - clock style: [[[DD:]HH:]MM:]SS[.fraction]
      - unit suffixes: e.g. "3h11m53s", "1 hour 3 minutes", "PT1H30M"
        (year, month and week components are accepted but not counted)
      - a single, possibly fractional amount of hours or minutes

    @returns    Seconds as a float, or None if s is not a string, is blank
                or matches none of the notations
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    clock = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if clock:
        days, hours, mins, secs, ms = clock.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        worded = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if worded:
            days, hours, mins, secs, ms = worded.groups()
        else:
            single = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not single:
                return None
            hours, mins = single.groups()
            days = secs = ms = None

    if ms:
        # The clock notation also allows ":" in front of the fraction
        ms = ms.replace(':', '.')
    weighted = ((days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1))
    return sum(float(part or 0) * mult for part, mult in weighted)
2524
2525
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the real extension of `filename`

    If `expected_real_ext` is given and the actual extension differs from it,
    `ext` is appended after the whole filename instead
    """
    stem, suffix = os.path.splitext(filename)
    if expected_real_ext and suffix[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{stem}.{ext}{suffix}'
2532
2533
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of `filename` with `ext`

    If `expected_real_ext` is given and the actual extension differs from it,
    nothing is stripped and `ext` is appended after the whole filename
    """
    stem, suffix = os.path.splitext(filename)
    if expected_real_ext and suffix[1:] != expected_real_ext:
        stem = filename
    return f'{stem}.{ext}'
2539
2540
def check_executable(exe, args=[]):
    """ Tries to run the given binary and returns its name if that worked,
    or False if it could not be started (OSError).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
2549
2550
def _get_exe_version_output(exe, args):
    """ Runs the executable with the given args and returns its output
    (stderr is merged into stdout).
    Returns False if it could not be started (OSError)
    and None if it exited with a non-zero return code """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        output, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else output
2563
2564
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Extracts the version (first regex group) from the output of an executable.
    version_re defaults to matching "version <something>".
    Returns unrecognized if the regex does not match """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2574
2575
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present.
    unrecognized is one or two fallback values: the first is returned when
    the version cannot be parsed from the output, the last when the
    executable exits with an error """
    fallbacks = variadic(unrecognized)
    assert len(fallbacks) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return fallbacks[-1]
    if not output:
        # False (could not be started) or empty output is passed through as-is
        return output
    return detect_exe_version(output, version_re, fallbacks[0])
2586
2587
def frange(start=0, stop=None, step=1):
    """Float range: like range(), but accepts non-integer arguments.
    A step of zero yields nothing"""
    if stop is None:
        stop, start = start, 0

    if not step:
        direction = 0
    elif step > 0:
        direction = 1
    else:
        direction = -1

    current = start
    # Multiplying both sides by the direction lets one comparison
    # serve both ascending and descending ranges
    while direction * current < direction * stop:
        yield current
        current += step
2596
2597
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Items are taken from the iterable only when they are needed and are cached.
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._reversed = reverse
        self._cache = _cache if _cache is not None else []

    def __iter__(self):
        if not self._reversed:
            yield from self._cache
            for item in self._iterable:
                self._cache.append(item)
                yield item
            return
        # We need to consume the entire iterable to iterate in reverse
        yield from self.exhaust()

    def _exhaust(self):
        """Move all remaining items into the cache and return the cache itself"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        items = self._exhaust()
        # Both branches slice, so the caller always gets a copy of the cache
        return items[::-1] if self._reversed else items[::1]

    @staticmethod
    def _reverse_index(x):
        if x is None:
            return None
        return ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        counts_from_end = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if counts_from_end:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            # Otherwise, fetch only as many items as the highest index requires
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))

        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Fetching one item is enough to know whether the list is empty
        probe = -1 if self._reversed else 0
        try:
            self[probe]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        return len(self._exhaust())

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2685
2686
class PagedList:
    """Base class for lists whose items are fetched one page at a time

    pagefunc(pagenum) must return an iterable with the items of that page.
    Subclasses have to implement _getslice(start, end)
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        self._cache = {}
        # Pages numbered above this are treated as empty without being fetched
        self._pagecount = float('inf')

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the items of page `pagenum` as a list, using the cache if enabled"""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            if pagenum > self._pagecount:
                page_results = []
            else:
                page_results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the items with index in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        found = self.getslice(idx, idx + 1)
        if found:
            return found[0]
        raise self.IndexError()
2725
2726
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            page_first = pagenum * self._pagesize
            next_page_first = pagenum * self._pagesize + self._pagesize
            if start >= next_page_first:
                continue

            # Offset of the first wanted item within this page
            offset = 0
            if page_first <= start < next_page_first:
                offset = start % self._pagesize

            # Offset just past the last wanted item, if the slice ends in this page
            cutoff = None
            if end is not None and page_first <= end <= next_page_first:
                cutoff = ((end - 1) % self._pagesize) + 1

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember that the pages from this one onwards cannot be fetched
                self._pagecount = pagenum - 1
                raise

            if offset != 0 or cutoff is not None:
                page_results = page_results[offset:cutoff]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + offset < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_page_first:
                break
2766
2767
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        # Caching is always enabled for this kind of list
        PagedList.__init__(self, pagefunc, pagesize, use_cache=True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        if end is None:
            last_page = self._pagecount
            remaining = None
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
            # Number of items that still have to be yielded
            remaining = end - start
        # Items to drop from the beginning of the first page
        to_skip = start - first_page * self._pagesize

        for pagenum in range(first_page, last_page):
            page_results = self.getpage(pagenum)
            if to_skip:
                page_results = page_results[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page_results) >= remaining:
                    yield from page_results[:remaining]
                    break
                remaining -= len(page_results)
            yield from page_results
2792
2793
class PlaylistEntries:
    """Indexable view over the 'entries' of a playlist info dict

    Indexing with an int or a slice (see __getitem__) returns a generator of
    (playlist index, entry) tuples, where the playlist index is 1-based.
    """

    # Placeholder stored in _entries for positions that have no entry
    MissingEntry = object()
    # Set to True once the end of the entries is known to have been reached
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        object providing `params`, `report_warning`, `_match_entry`
                          and (via its type) `_handle_extraction_exceptions`
        @param info_dict  dict with an 'entries' value and optionally 'requested_entries'
        Raises EntryNotInPlaylist if info_dict has no 'entries'
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # `entries` only holds the requested items. Put each of them at its
            # (1-based) requested index and leave every other slot as MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            # Any other iterable is wrapped so that it can be indexed lazily
            self._entries = LazyList(entries)

    # One comma-separated segment of a playlist items specification:
    # either a single index, or a range "[START](:|-)[END][:STEP]"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Generate an int or a slice for each comma-separated segment of `string`

        Raises ValueError for an empty segment, a segment that does not match
        PLAYLIST_ITEMS_RE or a step of zero
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # NB: end goes through float_or_none since it may be "inf"/"infinite"
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Generate (playlist index, entry) for the items selected by the
        'playlist_items', 'playliststart' and 'playlistend' params of ydl"""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    # Stop generating altogether, not just the current segment
                    return

    def get_full_count(self):
        """Return the total number of entries when it is known without having
        to fetch anything more; otherwise None is returned implicitly"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page, the page count is the entry count
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function returning the entry at a 0-based index

        The returned function raises self.IndexError when the index is past
        the end of the entries
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                # NB: For incomplete playlists, an out of range index is
                # reported as a missing entry rather than as the end of the list
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Accessing a lazy entry may trigger extraction, so the access
                    # is wrapped with ydl's _handle_extraction_exceptions
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Generate (1-based playlist index, entry) for an int or slice `idx`

        Unlike regular slices, start and stop of `idx` are 1-based and stop is
        inclusive. Negative values count from the end, which requires len(self)
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # frange excludes its stop value, so move it one further in the
        # direction of iteration to make the requested stop inclusive
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forwards, nothing can follow the end of the entries.
                # Going backwards, lower indices may still exist
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # NB: This iterates over (and so evaluates) all the entries
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2928
2929
def uppercase_escape(s):
    """Decode all \\UXXXXXXXX (8 hex digits) escape sequences found in s"""
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        text, _ = decode(mobj.group(0))
        return text

    return re.sub(r'\\U[0-9a-fA-F]{8}', unescape, s)
2936
2937
def lowercase_escape(s):
    """Decode all \\uXXXX (4 hex digits) escape sequences found in s"""
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        text, _ = decode(mobj.group(0))
        return text

    return re.sub(r'\\u[0-9a-fA-F]{4}', unescape, s)
2944
2945
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that are left as they are, in addition to the ones
    # that urllib.parse.quote never escapes
    KEEP_UNESCAPED = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe=KEEP_UNESCAPED)
2949
2950
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)
    escaped = {
        # The host is IDNA-encoded; the other components are percent-escaped
        'netloc': parts.netloc.encode('idna').decode('ascii'),
        'path': escape_rfc3986(parts.path),
        'params': escape_rfc3986(parts.params),
        'query': escape_rfc3986(parts.query),
        'fragment': escape_rfc3986(parts.fragment),
    }
    return parts._replace(**escaped).geturl()
2961
2962
def parse_qs(url, **kwargs):
    """Parse the query string of url into a dict of lists.
    kwargs are passed on to urllib.parse.parse_qs"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2965
2966
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file

    @param batch_fd  iterable of lines (str or bytes) with a close() method;
                     it is closed once all lines have been read
    @returns         list of URLs. Blank lines and lines starting with
                     "#", ";" or "]" are skipped, and anything after
                     a whitespace followed by "#" is stripped as a comment
    """
    def fixup(url):
        # Returns the cleaned up URL, or False if the line has no URL
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # The UTF-8 BOM, both mis-decoded as 3 separate characters and decoded properly
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: Passing maxsplit positionally is deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2984
2985
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data (same arguments as urllib.parse.urlencode)
    and return it as ASCII bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2988
2989
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  dict of query parameters to add to or replace
                            in the existing query; cannot be combined
                            with the `query` kwarg
       @returns             str
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            # Nothing to change; the string is returned without being re-parsed
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        existing_query = urllib.parse.parse_qs(url.query)
        kwargs['query'] = urllib.parse.urlencode(
            {**existing_query, **query_update}, doseq=True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
3008
3009
def update_url_query(url, query):
    """Return url with the parameters from the dict `query`
    added to (or replaced in) its query string"""
    updated_url = update_url(url, query_update=query)
    return updated_url
3012
3013
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Create a new request based on req

    @param req      the request to base the new one on; it is not modified
    @param url      new URL; defaults to the full URL of req
    @param data     new data; the data of req is used if this is falsy
    @param headers  headers to add to (or replace in) a copy of the headers of req
    @param query    query parameters to update in the URL
    @returns        a HEADRequest or PUTRequest if the method of req is HEAD
                    or PUT respectively, else an urllib.request.Request
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    body = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    request_classes = {'HEAD': HEADRequest, 'PUT': PUTRequest}
    request_cls = request_classes.get(req.get_method(), urllib.request.Request)

    clone = request_cls(
        target_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
3032
3033
3034 def _multipart_encode_impl(data, boundary):
3035     content_type = 'multipart/form-data; boundary=%s' % boundary
3036
3037     out = b''
3038     for k, v in data.items():
3039         out += b'--' + boundary.encode('ascii') + b'\r\n'
3040         if isinstance(k, str):
3041             k = k.encode()
3042         if isinstance(v, str):
3043             v = v.encode()
3044         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3045         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3046         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3047         if boundary.encode('ascii') in content:
3048             raise ValueError('Boundary overlaps with data')
3049         out += content
3050
3051     out += b'--' + boundary.encode('ascii') + b'--\r\n'
3052
3053     return out, content_type
3054
3055
def multipart_encode(data, boundary=None):
    """
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary and
        ValueError is raised if it overlaps with the data. Otherwise
        random boundaries are generated until one does not overlap.

    Returns a tuple of the encoded data (bytes) and the content type (str)

    Reference: https://tools.ietf.org/html/rfc7578
    """
    if boundary is not None:
        # A caller-supplied boundary cannot be substituted, so let any error propagate
        out, content_type = _multipart_encode_impl(data, boundary)
        return out, content_type

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            out, content_type = _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
        return out, content_type
3084
3085
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether x is an instance of allowed_types but not of blocked_types.
    By default, str, bytes and mappings are blocked"""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
3090
3091
def variadic(x, allowed_types=NO_DEFAULT):
    """Return x as it is if it is iterable-like, else wrap it in a 1-tuple.
    Instances of allowed_types are always wrapped, even if they are iterable"""
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
3097
3098
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each of funcs with args/kwargs until one gives an acceptable result

    A function that raises AttributeError, KeyError, TypeError, IndexError,
    ValueError or ZeroDivisionError is skipped, and so is a result that is
    not an instance of expected_type (when given).
    Returns the first acceptable result, or None if there is none
    """
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
3108
3109
def try_get(src, getter, expected_type=None):
    """Apply getter (one callable, or several to be tried in order) to src
    and return the first acceptable result; see try_call"""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
3112
3113
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of dct for which cndn(key, value) is
    true. By default, the items with a value of None are dropped"""
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
3116
3117
def merge_dicts(*dicts):
    """Merge dicts, with the earlier dicts taking precedence

    For each key, the first value that is not None is used. The only
    exception is an empty string, which is replaced by the next str value
    """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if key not in merged:
                if value is not None:
                    merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
3126
3127
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as it is if it is a str, else decode it using encoding.
    NB: the default encoding is determined once, when the function is defined"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
3130
3131
# Age limit that parse_age_limit() returns for each of these rating labels.
# The lookup is done after upper-casing the input
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
3139
3140
# Age limit that parse_age_limit() returns for each of these TV ratings.
# NB: Every key must start with "TV-": parse_age_limit() drops the first
# 3 characters of each key to build its regex and prepends "TV-" for the lookup
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3149
3150
def parse_age_limit(s):
    """Parse an age limit from an int, an age string or a rating label

    @param s  an int between 0 and 21, a string like "18" or "18+",
              a key of US_RATINGS, or a TV rating like "TV-14"/"TV_14"/"TV14"
              (ratings are matched case-insensitively)
    @returns  the age limit as an int, or None if it cannot be determined
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None

    age_match = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if age_match:
        return int(age_match.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    tv_match = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), rating)
    if not tv_match:
        return None
    return TV_PARENTAL_GUIDELINES['TV-' + tv_match.group(1)]
3167
3168
def strip_jsonp(code):
    """Strip the JSONP wrapper "callback(...)" and return only the data inside
    the parentheses. code is returned unchanged if it is not wrapped"""
    jsonp_re = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_re.sub(r'\g<callback_data>', code)
3177
3178
def js_to_json(code, vars={}, *, strict=False):
    """Convert JavaScript code (such as an object/array literal) to JSON text

    @param code    the JavaScript code to convert
    @param vars    dict of var, val pairs to substitute: an identifier in
                   `code` that is a key of `vars` is replaced with its value
    @param strict  If true, ValueError is raised for an identifier that cannot
                   be resolved (instead of it being quoted as a string), and
                   the lenient rewrites of "new ...", "parseInt(...)" and
                   self-invoking functions are skipped
    @returns       the converted text. NB: it is not validated as JSON here
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # A string literal in any of the quote styles, allowing for backslash escapes
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Optional whitespace and comment
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (regex, base) for integer literals that JSON does not support,
    # optionally followed by ":" when used as an object key
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # group(1) is an unescaped double quote; group(2) is the character after a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # - characters in JSON_PASSTHROUGH_ESCAPES are emitted with a backslash
        # - \xNN becomes \u00NN (the two hex digits are left in place after the match)
        # - a backslash followed by a newline (line continuation) is removed
        # - for anything else, only the backslash is dropped
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Evaluate the expression inside ${...} of a template string
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            # A JSON string is inserted without its quotes
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Replacement for each token matched by the final re.sub below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, "!" operators and trailing commas are removed
            return ''

        if v[0] in STRING_QUOTES:
            # Re-quote with double quotes; template strings get ${...} substituted first
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Object keys must be strings in JSON
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                # The value is not valid JSON by itself, so serialize it
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            # Unknown identifiers are treated as strings
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # new Map([[k, v], ...]) => JSON object built from the key-value pairs
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # new Date("...") => "..."
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        # Any other "new X(...)" expression is turned into a string as a whole
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        # parseInt(<non-digits><digits><non-digits>) => <digits>
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        # (function(...){...})("arg") => "arg"
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
3257
3258
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.
    The returned function gives the position of an id in quality_ids,
    or -1 if the id is not in it """
    def q(qid):
        with contextlib.suppress(ValueError):
            return quality_ids.index(qid)
        return -1
    return q
3267
3268
# NOTE(review): presumably the names of the stages at which a postprocessor
# can be run; the code that uses this is outside this section - confirm
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output templates, in %-format with named fields
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Maps each output template type to a file name suffix, or None
# NOTE(review): how the values are used is not visible in this section - confirm
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
3289
# According to [1], the syntax of a %-format specifier is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# Template of a regex matching one such specifier that is not escaped (as "%%").
# The template has two placeholders to be filled in before use:
# {0} is the pattern for the mapping key and {1} for the conversion type
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# NOTE(review): presumably the conversion type characters to be used
# for the {1} placeholder of STR_FORMAT_RE_TMPL - confirm against the callers
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
3308
3309
def limit_length(s, length):
    """ Add ellipses to overly long strings.
    The result, including the ellipses, is at most length characters long
    (provided that length is at least 3). None is returned for None """
    if s is None:
        return None
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES if len(s) > length else s
3318
3319
def version_tuple(v):
    """Split the version string v at each "." and "-" into a tuple of ints.
    Raises ValueError if any of the parts is not an integer"""
    return tuple(map(int, re.split(r'[-.]', v)))
3322
3323
def is_outdated_version(version, limit, assume_new=True):
    """Whether version is older than limit.
    If version is empty or either of them cannot be parsed, the version
    is considered outdated only if assume_new is false"""
    verdict_if_unknown = not assume_new
    if not version:
        return verdict_if_unknown
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return verdict_if_unknown
    return current < minimum
3331
3332
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # The import is done inside the function, not at module level
    from ..update import is_non_updateable

    cannot_update = is_non_updateable()
    return not cannot_update
3339
3340
def args_to_str(args):
    """ Get a short string representation for a subprocess command """
    quoted_args = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
3344
3345
def error_to_str(err):
    """ Format an exception as '<class name>: <message>' """
    return '{}: {}'.format(type(err).__name__, err)
3348
3349
def mimetype2ext(mt, default=NO_DEFAULT):
    """ Map a MIME type to a file extension

    @param mt       The MIME type. Parameters after ';' are ignored
    @param default  Value returned when `mt` is unknown or not a string
    @returns        The mapped extension. Without a `default`, None is returned for
                    non-string input and the subtype (with '+' replaced by '.')
                    for unknown MIME types
    """
    has_default = default is not NO_DEFAULT
    if not isinstance(mt, str):
        return default if has_default else None

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop the parameters ("; charset=...") and normalize
    mimetype = mt.split(';', 1)[0].strip().lower()
    subtype = mimetype.rpartition('/')[2]
    structured_suffix = subtype.rsplit('+')[-1]

    # Lookup order: full type, then subtype, then the part after the last '+'
    ext = traversal.traverse_obj(MAP, mimetype, subtype, structured_suffix)
    if ext:
        return ext
    if has_default:
        return default
    return subtype.replace('+', '.')
3438
3439
def ext2mimetype(ext_or_url):
    """ Guess the MIME type from a bare extension or from a filename/URL

    Returns None for empty input or when the type cannot be guessed
    """
    if not ext_or_url:
        return None
    # A value without any dot is taken to be a bare extension
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mimetype, _ = mimetypes.guess_type(filename)
    return mimetype
3446
3447
def parse_codecs(codecs_str):
    """ Split a comma separated codecs string into video, audio and subtitle codecs

    See http://tools.ietf.org/html/rfc6381

    @returns  {} for empty input. Otherwise a dict with 'vcodec', 'acodec'
              ('none' when absent), 'dynamic_range' and, only if one was found, 'scodec'.
              When no codec is recognized but exactly two were given,
              they are taken to be the video and audio codec, in that order
    """
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    stripped = map(str.strip, codecs_str.strip().strip(',').split(','))
    split_codecs = [codec for codec in stripped if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros in front of a digit are dropped before the codec is split into its parts
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if not vcodec:  # Only the first video codec counts
                vcodec = full_codec
                if family in ('dvh1', 'dvhe'):
                    hdr = 'DV'
                elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                    hdr = 'HDR10'
                elif parts[:2] == ['vp9', '2']:
                    hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if vcodec or acodec or scodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if scodec is not None:
            result['scodec'] = scodec
        return result
    if len(split_codecs) == 2:
        video, audio = split_codecs
        return {
            'vcodec': video,
            'acodec': audio,
        }
    return {}
3488
3489
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """ Pick a container extension able to hold the given video and audio streams

    @param vcodecs, vexts  Codecs and extensions of the video streams (same length)
    @param acodecs, aexts  Codecs and extensions of the audio streams (same length)
    @param preferences     Extensions to consider, in order. Defaults to any
    @returns               The chosen extension. 'mkv' is the fallback whenever it is
                           allowed, otherwise the last entry of `preferences`
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    mkv_allowed = not preferences or 'mkv' in preferences
    has_multiple_streams = max(len(acodecs), len(vcodecs)) > 1

    if mkv_allowed and has_multiple_streams:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codecs):
        # Reduce the first codec to its family name with all zeros removed
        return try_get(codecs, getter=lambda x: x[0].split('.')[0].replace('0', ''))

    wanted_codecs = (sanitize_codec(vcodecs), sanitize_codec(acodecs))

    # First try to decide from the codecs ...
    for candidate in preferences or COMPATIBLE_CODECS.keys():
        if candidate == 'mkv':
            return candidate
        if COMPATIBLE_CODECS.get(candidate, set()).issuperset(wanted_codecs):
            return candidate

    # ... and then from the extensions of the streams
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for candidate in preferences or vexts:
        if candidate == 'mkv':
            return candidate
        current_exts = {candidate, *vexts, *aexts}
        if current_exts == {candidate}:
            return candidate
        if any(family.issuperset(current_exts) for family in COMPATIBLE_EXTS):
            return candidate
    return 'mkv' if mkv_allowed else preferences[-1]
3528
3529
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """ Detect the file extension from the headers of a response

    The headers are tried in this order: the filename of an attachment
    'Content-Disposition', the 'x-amz-meta-name' and finally the 'Content-Type'
    """
    get_header = url_handle.headers.get

    disposition = get_header('Content-Disposition')
    mobj = disposition and re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
    if mobj:
        ext = determine_ext(mobj.group('filename'), default_ext=None)
        if ext:
            return ext

    amz_name = get_header('x-amz-meta-name')
    if amz_name:
        # Everything after the last dot (the whole value if there is no dot)
        ext = amz_name.rsplit('.', 1)[-1]
        if ext:
            return ext

    return mimetype2ext(get_header('Content-Type'), default=default)
3548
3549
def encode_data_uri(data, mime_type):
    """ Build a base64 encoded 'data:' URI of the given MIME type from bytes """
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type!s};base64,{payload!s}'
3552
3553
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when either no limit is set (age_limit)
    # or the content is available for everyone (content_limit)
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3562
3563
# List of known byte-order-marks (BOM)
# The UTF-32 marks come before the UTF-16 ones since
# the UTF-32-LE mark starts with the UTF-16-LE mark
BOMS = [
    (codecs.BOM_UTF8, 'utf-8'),
    (codecs.BOM_UTF32_BE, 'utf-32-be'),
    (codecs.BOM_UTF32_LE, 'utf-32-le'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
]


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) if the first non-whitespace
    character of the decoded text is '<', None otherwise
    """

    encoding = 'utf-8'
    remaining = first_bytes
    for mark, mark_encoding in BOMS:
        mark_length = len(mark)
        # Repeated marks are all removed; the last one found decides the encoding
        while remaining.startswith(mark):
            remaining = remaining[mark_length:]
            encoding = mark_encoding

    text = remaining.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3583
3584
def determine_protocol(info_dict):
    """ Get the download protocol of a format

    An explicit 'protocol' in `info_dict` wins. Otherwise it is derived
    from the start of the URL, its extension, or the URL scheme
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for known_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(known_prefix):
            return known_prefix

    ext = determine_ext(url)
    if ext == 'f4m':
        return 'f4m'
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'

    return urllib.parse.urlparse(url).scheme
3605
3606
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param header_row  The values of the first row
    @param data        The remaining rows
    @param delim       If set, a row made by repeating this string is put below the header
    @param extra_gap   Number of additional spaces between two columns
    @param hide_empty  Drop the columns in which every cell of `data` has no visible text
    @returns           The table as a single string, one line per row
    """
    def visible_length(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_length(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, mask):
        # Columns beyond the end of the mask are always kept
        return [cell for keep, cell in itertools.zip_longest(mask, row, fillvalue=True) if keep]

    # A column whose widest data cell is 0 characters wide is falsy in the mask
    mask = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, mask)
    data = [keep_columns(row, mask) for row in data]

    widths = column_widths([header_row] + data)
    gap = extra_gap + 1

    table = [header_row]
    if delim:
        separator = [delim * (column_width + gap) for column_width in widths]
        separator[-1] = separator[-1][:-gap * len(delim)]  # Remove extra_gap from end of delimiter
        table.append(separator)
    table.extend(data)

    lines = []
    for row in table:
        cells = []
        for pos, text in enumerate(map(str, row)):
            padding = widths[pos] - visible_length(text)
            if '\t' in text:
                # The padding goes where the tab is, pushing the rest to the right
                cells.append(text.replace('\t', ' ' * padding) + ' ' * gap)
            else:
                cells.append(text + ' ' * (padding + gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3637
3638
3639 def _match_one(filter_part, dct, incomplete):
3640     # TODO: Generalize code with YoutubeDL._build_format_filter
3641     STRING_OPERATORS = {
3642         '*=': operator.contains,
3643         '^=': lambda attr, value: attr.startswith(value),
3644         '$=': lambda attr, value: attr.endswith(value),
3645         '~=': lambda attr, value: re.search(value, attr),
3646     }
3647     COMPARISON_OPERATORS = {
3648         **STRING_OPERATORS,
3649         '<=': operator.le,  # "<=" must be defined above "<"
3650         '<': operator.lt,
3651         '>=': operator.ge,
3652         '>': operator.gt,
3653         '=': operator.eq,
3654     }
3655
3656     if isinstance(incomplete, bool):
3657         is_incomplete = lambda _: incomplete
3658     else:
3659         is_incomplete = lambda k: k in incomplete
3660
3661     operator_rex = re.compile(r'''(?x)
3662         (?P<key>[a-z_]+)
3663         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3664         (?:
3665             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3666             (?P<strval>.+?)
3667         )
3668         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3669     m = operator_rex.fullmatch(filter_part.strip())
3670     if m:
3671         m = m.groupdict()
3672         unnegated_op = COMPARISON_OPERATORS[m['op']]
3673         if m['negation']:
3674             op = lambda attr, value: not unnegated_op(attr, value)
3675         else:
3676             op = unnegated_op
3677         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3678         if m['quote']:
3679             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3680         actual_value = dct.get(m['key'])
3681         numeric_comparison = None
3682         if isinstance(actual_value, (int, float)):
3683             # If the original field is a string and matching comparisonvalue is
3684             # a number we should respect the origin of the original field
3685             # and process comparison value as a string (see
3686             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3687             try:
3688                 numeric_comparison = int(comparison_value)
3689             except ValueError:
3690                 numeric_comparison = parse_filesize(comparison_value)
3691                 if numeric_comparison is None:
3692                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3693                 if numeric_comparison is None:
3694                     numeric_comparison = parse_duration(comparison_value)
3695         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3696             raise ValueError('Operator %s only supports string values!' % m['op'])
3697         if actual_value is None:
3698             return is_incomplete(m['key']) or m['none_inclusive']
3699         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3700
3701     UNARY_OPERATORS = {
3702         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3703         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3704     }
3705     operator_rex = re.compile(r'''(?x)
3706         (?P<op>%s)\s*(?P<key>[a-z_]+)
3707         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3708     m = operator_rex.fullmatch(filter_part.strip())
3709     if m:
3710         op = UNARY_OPERATORS[m.group('op')]
3711         actual_value = dct.get(m.group('key'))
3712         if is_incomplete(m.group('key')) and actual_value is None:
3713             return True
3714         return op(actual_value)
3715
3716     raise ValueError('Invalid filter part %r' % filter_part)
3717
3718
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    The conditions are separated by "&" (written as "\\&" to get a literal "&")
    and all of them have to pass
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        condition = filter_part.replace(r'\&', '&')
        if not _match_one(condition, dct, incomplete):
            return False
    return True
3729
3730
def match_filter_func(filters, breaking_filters=None):
    """ Build a function that checks an info_dict against the given filters

    @param filters           One or more filter strings. An entry passes if any of
                             them matches. The special filter "-" makes it interactive
    @param breaking_filters  Filters for which RejectedVideoReached is raised
                             with the message of the failed filter
    @returns                 None if no filter was given at all. Otherwise a function
                             `(info_dict, incomplete=False)` that returns a message if the
                             entry is to be skipped; else NO_DEFAULT in interactive
                             mode (unless `incomplete`) and None in any other case
    """
    if not (filters or breaking_filters):
        return None
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        rejection = breaking_filters(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if filters and not any(match_str(f, info_dict, incomplete) for f in filters):
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
        return NO_DEFAULT if interactive and not incomplete else None
    return _match_func
3753
3754
class download_range_func:
    """ Callable that yields the sections of a video to download

    @param chapters  Regexes matched against the titles of the chapters
    @param ranges    (start, end) pairs of the time ranges
    """

    def __init__(self, chapters, ranges):
        self.chapters = chapters
        self.ranges = ranges

    def __call__(self, info_dict, ydl):
        """ Yield a dict for each matching chapter and then for each time range.
        Without any chapters and ranges, a single empty dict is yielded """
        if not (self.ranges or self.chapters):
            yield {}

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        found_chapter = False
        for regex in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(regex, chapter['title']):
                    continue
                found_chapter = True
                yield {**chapter, 'index': index}
        if self.chapters and not found_chapter:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3781
3782
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP time expression into a number of seconds

    Understood are a number with an optional "s" behind it and clock values
    like "H:MM:SS", "H:MM:SS.fraction" and "H:MM:SS:fraction".
    None is returned for anything else
    """
    if not time_expr:
        return None

    offset = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset is not None:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock is None:
        return None
    hours, minutes, seconds = clock.groups()
    # The fraction of the seconds may be separated by a colon
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3794
3795
def srt_subtitles_timecode(seconds):
    """ Format a number of seconds as an SRT timecode (HH:MM:SS,mmm) """
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
3798
3799
def ass_subtitles_timecode(seconds):
    """ Format a number of seconds as an ASS timecode (H:MM:SS.cc) """
    timetuple = timetuple_from_msec(seconds * 1000)
    # ASS has centiseconds where the time tuple has milliseconds
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
3803
3804
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT. Of the styling, only
    the properties listed in SUPPORTED_STYLING are carried over

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the data contains no paragraph (<p>) at all
    '''
    # (current namespace, [legacy namespaces that are rewritten to it])
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> {property: value}, with the parent styles merged in
    default_style = {}  # the style of <body>/<div>, applied to every element

    class TTMLPElementParser:
        # Target of an XMLParser that turns a single <p> into SRT markup.
        # NOTE(review): the two lists are class attributes that are only ever
        # mutated in place, so they are shared by all the parsers created
        # during one call of dfxp2srt. Whatever is left in them by one
        # paragraph is seen by the next - confirm that this is intended
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Later updates override earlier ones:
                # default style < referenced style < inline attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Nothing to do if the enclosing element already set this value
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened by the matching start() in reverse order
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize the node and feed it to a parser with a fresh target
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # A style can only be resolved once its parent is, and the parent may be
    # defined further down. So the styles are processed again and again
    # until none of them had to be postponed
    while True:
        previous_styles = {style_id: dict(props) for style_id, props in styles.items()}
        repeat = False
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # A pass that changed nothing would be repeated identically forever.
        # This happens if a parent never gets resolved, i.e. it is not defined
        # or has no supported property. The styles depending on it are dropped
        if not repeat or styles == previous_styles:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # The index counts the skipped paragraphs as well
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Without an end, it has to be computed from the duration
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3971
3972
def cli_option(params, command_option, param, separator=None):
    """ Build the command line arguments for an option that takes a value

    @param params          The mapping the value is looked up in
    @param command_option  The name of the option on the command line
    @param param           The key of the value in `params`
    @param separator       If given, option and value are joined by it into one argument
    @returns               A list of arguments, empty if the value is None
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3978
3979
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Build the command line arguments for an option that takes a boolean value

    The value of `param` has to be True, False or None. It is passed as
    `true_value`/`false_value`, while nothing is passed at all for None
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered_values = {True: true_value, False: false_value}
    return cli_option(rendered_values, command_option, flag, separator)
3984
3985
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return the option as a one element list if `param` has the expected value, else an empty list """
    if params.get(param) == expected_value:
        return [command_option]
    return []
3988
3989
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """ Look up the arguments configured for the first matching key

    @param argdict     A dict mapping lowercase keys to lists of arguments. A list/tuple
                       is accepted for backward compatibility and gets returned as it
                       is, unless `use_compat` is off in which case it is ignored
    @param keys        The keys to try, in order. An entry may itself be a collection
                       of keys, whose arguments are then concatenated
    @param default     Returned as it is (not copied) when nothing is found,
                       so it must not be modified by the caller
    @returns           The list of arguments
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        configured = (argdict.get(key.lower()) for key in variadic(key_list))
        found = [args for args in configured if args is not None]
        if found:
            return list(itertools.chain.from_iterable(found))
    return default
4008
4009
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """ Build the keys for `main_key` and `exe` and look them up with cli_configuration_args

    The root key is "exe" if it equals the main key and "main_key+exe" otherwise.
    Each of `keys` is a suffix that gets appended to the root key. Only when the
    plain root key is among the results, the fallbacks (main key and exe combined,
    and "default") are added. Otherwise `use_compat` is turned off
    """
    main_key, exe = main_key.lower(), exe.lower()
    same_name = main_key == exe
    root_key = exe if same_name else f'{main_key}+{exe}'
    suffixes = keys or ['']
    keys = [f'{root_key}{suffix}' for suffix in suffixes]
    if root_key not in keys:
        use_compat = False
    else:
        if not same_name:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
4021
4022
4023 class ISO639Utils:
4024     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4025     _lang_map = {
4026         'aa': 'aar',
4027         'ab': 'abk',
4028         'ae': 'ave',
4029         'af': 'afr',
4030         'ak': 'aka',
4031         'am': 'amh',
4032         'an': 'arg',
4033         'ar': 'ara',
4034         'as': 'asm',
4035         'av': 'ava',
4036         'ay': 'aym',
4037         'az': 'aze',
4038         'ba': 'bak',
4039         'be': 'bel',
4040         'bg': 'bul',
4041         'bh': 'bih',
4042         'bi': 'bis',
4043         'bm': 'bam',
4044         'bn': 'ben',
4045         'bo': 'bod',
4046         'br': 'bre',
4047         'bs': 'bos',
4048         'ca': 'cat',
4049         'ce': 'che',
4050         'ch': 'cha',
4051         'co': 'cos',
4052         'cr': 'cre',
4053         'cs': 'ces',
4054         'cu': 'chu',
4055         'cv': 'chv',
4056         'cy': 'cym',
4057         'da': 'dan',
4058         'de': 'deu',
4059         'dv': 'div',
4060         'dz': 'dzo',
4061         'ee': 'ewe',
4062         'el': 'ell',
4063         'en': 'eng',
4064         'eo': 'epo',
4065         'es': 'spa',
4066         'et': 'est',
4067         'eu': 'eus',
4068         'fa': 'fas',
4069         'ff': 'ful',
4070         'fi': 'fin',
4071         'fj': 'fij',
4072         'fo': 'fao',
4073         'fr': 'fra',
4074         'fy': 'fry',
4075         'ga': 'gle',
4076         'gd': 'gla',
4077         'gl': 'glg',
4078         'gn': 'grn',
4079         'gu': 'guj',
4080         'gv': 'glv',
4081         'ha': 'hau',
4082         'he': 'heb',
4083         'iw': 'heb',  # Replaced by he in 1989 revision
4084         'hi': 'hin',
4085         'ho': 'hmo',
4086         'hr': 'hrv',
4087         'ht': 'hat',
4088         'hu': 'hun',
4089         'hy': 'hye',
4090         'hz': 'her',
4091         'ia': 'ina',
4092         'id': 'ind',
4093         'in': 'ind',  # Replaced by id in 1989 revision
4094         'ie': 'ile',
4095         'ig': 'ibo',
4096         'ii': 'iii',
4097         'ik': 'ipk',
4098         'io': 'ido',
4099         'is': 'isl',
4100         'it': 'ita',
4101         'iu': 'iku',
4102         'ja': 'jpn',
4103         'jv': 'jav',
4104         'ka': 'kat',
4105         'kg': 'kon',
4106         'ki': 'kik',
4107         'kj': 'kua',
4108         'kk': 'kaz',
4109         'kl': 'kal',
4110         'km': 'khm',
4111         'kn': 'kan',
4112         'ko': 'kor',
4113         'kr': 'kau',
4114         'ks': 'kas',
4115         'ku': 'kur',
4116         'kv': 'kom',
4117         'kw': 'cor',
4118         'ky': 'kir',
4119         'la': 'lat',
4120         'lb': 'ltz',
4121         'lg': 'lug',
4122         'li': 'lim',
4123         'ln': 'lin',
4124         'lo': 'lao',
4125         'lt': 'lit',
4126         'lu': 'lub',
4127         'lv': 'lav',
4128         'mg': 'mlg',
4129         'mh': 'mah',
4130         'mi': 'mri',
4131         'mk': 'mkd',
4132         'ml': 'mal',
4133         'mn': 'mon',
4134         'mr': 'mar',
4135         'ms': 'msa',
4136         'mt': 'mlt',
4137         'my': 'mya',
4138         'na': 'nau',
4139         'nb': 'nob',
4140         'nd': 'nde',
4141         'ne': 'nep',
4142         'ng': 'ndo',
4143         'nl': 'nld',
4144         'nn': 'nno',
4145         'no': 'nor',
4146         'nr': 'nbl',
4147         'nv': 'nav',
4148         'ny': 'nya',
4149         'oc': 'oci',
4150         'oj': 'oji',
4151         'om': 'orm',
4152         'or': 'ori',
4153         'os': 'oss',
4154         'pa': 'pan',
4155         'pe': 'per',
4156         'pi': 'pli',
4157         'pl': 'pol',
4158         'ps': 'pus',
4159         'pt': 'por',
4160         'qu': 'que',
4161         'rm': 'roh',
4162         'rn': 'run',
4163         'ro': 'ron',
4164         'ru': 'rus',
4165         'rw': 'kin',
4166         'sa': 'san',
4167         'sc': 'srd',
4168         'sd': 'snd',
4169         'se': 'sme',
4170         'sg': 'sag',
4171         'si': 'sin',
4172         'sk': 'slk',
4173         'sl': 'slv',
4174         'sm': 'smo',
4175         'sn': 'sna',
4176         'so': 'som',
4177         'sq': 'sqi',
4178         'sr': 'srp',
4179         'ss': 'ssw',
4180         'st': 'sot',
4181         'su': 'sun',
4182         'sv': 'swe',
4183         'sw': 'swa',
4184         'ta': 'tam',
4185         'te': 'tel',
4186         'tg': 'tgk',
4187         'th': 'tha',
4188         'ti': 'tir',
4189         'tk': 'tuk',
4190         'tl': 'tgl',
4191         'tn': 'tsn',
4192         'to': 'ton',
4193         'tr': 'tur',
4194         'ts': 'tso',
4195         'tt': 'tat',
4196         'tw': 'twi',
4197         'ty': 'tah',
4198         'ug': 'uig',
4199         'uk': 'ukr',
4200         'ur': 'urd',
4201         'uz': 'uzb',
4202         've': 'ven',
4203         'vi': 'vie',
4204         'vo': 'vol',
4205         'wa': 'wln',
4206         'wo': 'wol',
4207         'xh': 'xho',
4208         'yi': 'yid',
4209         'ji': 'yid',  # Replaced by yi in 1989 revision
4210         'yo': 'yor',
4211         'za': 'zha',
4212         'zh': 'zho',
4213         'zu': 'zul',
4214     }
4215
4216     @classmethod
4217     def short2long(cls, code):
4218         """Convert language code from ISO 639-1 to ISO 639-2/T"""
4219         return cls._lang_map.get(code[:2])
4220
4221     @classmethod
4222     def long2short(cls, code):
4223         """Convert language code from ISO 639-2/T to ISO 639-1"""
4224         for short_name, long_name in cls._lang_map.items():
4225             if long_name == code:
4226                 return short_name
4227
4228
class ISO3166Utils:
    """Lookup of full country names by two-letter country code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """
        Convert a two-letter country code (ISO 3166-1 alpha-2) to the corresponding full name

        The lookup is case-insensitive. Returns None for codes missing from the table.
        """
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
4490
4491
class GeoUtils:
    """Helpers for picking an IPv4 address that appears to originate from a given country"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """
        Return a random IPv4 address (dotted-quad string) from an address block

        @param code_or_block  either a two-letter country code, which is looked up
                              case-insensitively in the block table, or an explicit
                              block in CIDR notation such as '10.0.0.0/8'
        @returns              the address, or None if the country code is unknown
        """
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        host_mask = 0xffffffff >> int(preflen)
        # Clear the host bits first so that a block written with a non-zero host part
        # (e.g. '10.1.2.255/24') still spans the whole block instead of only its tail
        addr_min = struct.unpack('!L', socket.inet_aton(addr))[0] & ~host_mask
        addr_max = addr_min | host_mask
        return socket.inet_ntoa(
            struct.pack('!L', random.randint(addr_min, addr_max)))
4750
4751
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """
    ProxyHandler whose proxy can be chosen per request

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to use
    for that request only; the header is removed before the request goes on.
    """

    def __init__(self, proxies=None):
        # Register no-proxy defaults for http/https first; the base class
        # initializer then replaces them for every scheme present in `proxies`
        for scheme in ('http', 'https'):
            default_opener = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
            setattr(self, f'{scheme}_open', default_opener)
        super().__init__(proxies)

    def proxy_open(self, req, proxy, type):
        """
        Route `req` through `proxy`, unless the request overrides or disables it

        Returns None when no proxy applies or when a SOCKS proxy was recorded on
        the request; otherwise defers to the base class implementation.
        """
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = per_request_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = urllib.parse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # Only record the proxy here: yt-dlp's http/https handlers
            # are the ones wrapping the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return super().proxy_open(req, proxy, type)
4775
4776
4777 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4778 # released into Public Domain
4779 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4780
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    Zero and negative values yield a single zero byte.
    If optional blocksize is given and greater than zero, the front of the
    byte string is padded with binary zeros so that the length is a multiple
    of blocksize.
    """
    remaining = int(n)
    # Collect 32-bit words, least significant first
    words = []
    while remaining > 0:
        words.append(struct.pack('>I', remaining & 0xffffffff))
        remaining >>= 32
    # Drop leading zeros, but always keep at least one byte
    packed = b''.join(reversed(words)).lstrip(b'\000') or b'\000'
    # Add back pad bytes up to the next multiple of blocksize
    overhang = len(packed) % blocksize if blocksize > 0 else 0
    if overhang:
        packed = b'\000' * (blocksize - overhang) + packed
    return packed
4809
4810
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    An empty string converts to 0.
    """
    # Left-pad with zeros to a whole number of 32-bit words
    missing = -len(s) % 4
    if missing:
        s = b'\000' * missing + s
    value = 0
    for (word,) in struct.iter_unpack('>I', s):
        value = (value << 32) | word
    return value
4826
4827
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: lowercase hex string of encrypted data, without leading zeros

    Limitation: supports one block encryption only
    '''

    # The data is read as a little-endian number, hence the reversal
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return f'{ciphertext:x}'
4843
4844
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout [0, 2, <non-zero padding>, 0, <data>].

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data does not leave room for at least 8 padding bytes
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding must not contain a zero: the first zero byte after the
    # 0x02 marker is what separates the padding from the data
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2, *pseudo_random, 0, *data]
4858
4859
4860 def _base_n_table(n, table):
4861     if not table and not n:
4862         raise ValueError('Either table or n must be specified')
4863     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4864
4865     if n and n != len(table):
4866         raise ValueError(f'base {n} exceeds table length {len(table)}')
4867     return table
4868
4869
def encode_base_n(num, n=None, table=None):
    """
    Convert given int to a base-n string

    @param num    non-negative integer to convert
    @param n      base; when given, only the first n symbols of the table are used
    @param table  string of digit symbols, least digit value first
                  (the default table of _base_n_table is used when omitted)
    @returns      the digits of num, most significant first
    @raises ValueError  if num is negative, or if it is non-zero and the
                        table has fewer than two symbols
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    # Floor division never brings a negative number (or any number in base 1)
    # down to zero, so these inputs would otherwise loop forever
    if num < 0:
        raise ValueError(f'Cannot encode negative number {num}')
    if base < 2:
        raise ValueError(f'Cannot encode {num} with a table of {base} symbol(s)')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4881
4882
def decode_base_n(string, n=None, table=None):
    """
    Convert given base-n string to int

    The digit symbols come from _base_n_table(n, table); a character that is
    not one of them raises KeyError.
    """
    symbols = _base_n_table(n, table)
    digit_values = dict(zip(symbols, range(len(symbols))))
    base = len(digit_values)

    value = 0
    for digit in string:
        value = value * base + digit_values[digit]
    return value
4890
4891
def decode_packed_codes(code):
    """
    Expand the packed script found in `code` by PACKED_CODES_RE

    The match supplies the obfuscated code, a base, a symbol count and a
    '|'-separated symbol list. Every word of the obfuscated code is the base-n
    encoding of an index into that list and is replaced by the symbol stored
    there (or kept as is when that symbol is empty).
    """
    packed = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = packed.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in reversed(range(count)):
        word = encode_base_n(index, base)
        symbol_table[word] = symbols[index] or word

    def _expand(word_match):
        return symbol_table[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _expand, obfuscated_code)
4908
4909
def caesar(s, alphabet, shift):
    """
    Apply a Caesar cipher to `s`

    Every character found in `alphabet` is replaced by the one `shift`
    positions further along (wrapping around, negative shifts go backwards);
    all other characters are kept unchanged.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def _rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(_rotate, s))
4917
4918
def rot47(s):
    """Rotate every printable ASCII character ('!' through '~') of `s` by 47 places"""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4921
4922
def parse_m3u8_attributes(attrib):
    """
    Parse a comma-separated KEY=value attribute list into a dict

    Values may be bare or double-quoted; the surrounding quotes of a quoted
    value are removed. All values are returned as strings.
    """
    attributes = {}
    for match in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        name, value = match.group('key', 'val')
        attributes[name] = value[1:-1] if value.startswith('"') else value
    return attributes
4930
4931
def urshift(val, n):
    """Shift `val` right by `n` bits without sign extension, reading negative values as unsigned 32-bit"""
    if val < 0:
        val += 0x100000000
    return val >> n
4934
4935
def write_xattr(path, key, value):
    """
    Set the extended attribute `key` of the file `path` to `value`

    `value` is a bytes object: it is written as is on Windows, passed as is to
    the xattr module, and decoded to text for the command-line tools.

    The first available method is used:
      - on Windows, an NTFS Alternate Data Stream named `{path}:{key}`
      - the "xattr"/"pyxattr" python module
      - the "setfattr" or "xattr" executable

    @raises XAttrMetadataError     if writing the attribute failed
    @raises XAttrUnavailableError  if neither a module nor an executable is available
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        # A colon separates the file name from the stream name, so the key cannot contain one
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        # An older pyxattr leaves setxattr unset, so the executables below are tried instead
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    # setfattr is preferred when both are available
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # Command-line arguments have to be text
    value = value.decode()
    try:
        # The two tools take their arguments differently:
        #   xattr -w KEY VALUE PATH
        #   setfattr -n KEY -v VALUE PATH
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    # A non-zero exit status is reported together with whatever the tool wrote to stderr
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4984         raise XAttrMetadataError(returncode, stderr)
4985
4986
def random_birthday(year_field, month_field, day_field):
    """
    Pick a random date between 1950-01-01 and 1995-12-31 (both inclusive)

    Returns a dict that maps the three given field names to the year, month
    and day of that date, each as a decimal string without zero padding.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    day_offset = random.randint(0, (latest - earliest).days)
    birthday = earliest + datetime.timedelta(days=day_offset)

    field_names = (year_field, month_field, day_field)
    date_parts = (birthday.year, birthday.month, birthday.day)
    return dict(zip(field_names, map(str, date_parts)))
4997
4998
def find_available_port(interface=''):
    """
    Return a port number that is currently free on `interface`

    The port is found by binding a temporary socket to port 0 and reading back
    the port that was assigned; the socket is closed again before returning.
    Returns None if the socket cannot be created or bound.
    """
    with contextlib.suppress(OSError), socket.socket() as probe:
        probe.bind((interface, 0))
        _, port = probe.getsockname()[:2]
        return port
    return None
5006
5007
# Templates for internet shortcut files, which are plain text files.
# They are %-format strings with named placeholders: every template takes
# %(url)s, and the desktop entry additionally takes %(filename)s.

# "[InternetShortcut]" file (.url)
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# XML property list holding a single URL key (.webloc)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# "[Desktop Entry]" file of type Link (.desktop)
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps a link type name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5039
5040
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    The hostname is converted to Punycode. An explicit port is kept, except for port 80 on `http` URLs, where it is the default and therefore dropped. IRIs without a hostname (e.g. relative references) only get their remaining components escaped.

    Raises ValueError for IPv6 hosts, which are not supported.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    if iri_parts.hostname:
        # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
        net_location += iri_parts.hostname.encode('idna').decode()

    port = iri_parts.port
    # Port 80 is only redundant for http; for any other scheme, dropping it would change the target
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += f':{port}'

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5081
5082     # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5083
5084
def to_high_limit_path(path):
    """Return `path` in a form that works around the Windows MAX_PATH limitation.

    On win32/cygwin the absolute form of `path` is returned behind the
    extended-length path prefix; on every other platform `path` is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # The maximum allowed length for the individual path segments may still be quite limited.
    absolute = os.path.abspath(path)
    return '\\\\?\\' + absolute
5091
5092
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Look up `field` in `obj` and render it with `template`.

    @param obj          Object passed to `traversal.traverse_obj`
    @param field        Path (or paths) to look up; wrapped with `variadic`
    @param template     %-style template applied to `func(value)`
    @param ignore       Value(s) for which `default` is returned instead.
                        When not given, any falsy value is ignored
    @param default      Returned when the value is ignored
    @param func         Applied to the value before formatting
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val
    else:
        skip = val in variadic(ignore)
    if skip:
        return default
    return template % func(val)
5098
5099
def clean_podcast_url(url):
    """Remove every known podcast tracking/analytics redirect prefix found in `url`."""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix_re, '', url)
5115
5116
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version 4 UUID as a lowercase, hyphenated hex string.

    Every `x` of the template is replaced with a random hex digit. The `y`
    position holds the variant bits, so it is limited to 8, 9, a or b as
    RFC 4122 requires for a version 4 UUID.
    """
    def _random_digit(mobj):
        if mobj.group() == 'y':
            # Variant nibble: the two most significant bits must be `10`
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5122
5123
def make_dir(path, to_screen=None):
    """Create the directory that is to contain `path`, including missing parents.

    @param path         File path whose directory component should exist
    @param to_screen    Optional callable that receives a message when creation fails
    @returns            True when the directory exists afterwards (or `path` has no
                        directory component), False when it could not be created
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Reporting is optional. `callable(...) is not None` was always true,
        # so a failure without a callback used to raise TypeError here
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
5134
5135
def get_executable_path():
    """Return the absolute directory of the path reported by `_get_variant_and_executable_path`."""
    # Imported here rather than at module level
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    absolute = os.path.abspath(executable)
    return os.path.dirname(absolute)
5140
5141
def get_user_config_dirs(package_name):
    """Yield the per-user configuration directories of `package_name`, in lookup order."""
    # .config (e.g. ~/.config/package_name)
    config_home = os.getenv('XDG_CONFIG_HOME')
    if not config_home:
        config_home = compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name); only when the variable is set and non-empty
    roaming = os.getenv('appdata')
    if roaming:
        yield os.path.join(roaming, package_name)

    # home (~/.package_name)
    home = compat_expanduser('~')
    yield os.path.join(home, f'.{package_name}')
5154
5155
def get_system_config_dirs(package_name):
    """Yield the system-wide configuration directories of `package_name`."""
    etc_dir = os.path.join('/etc', package_name)  # /etc/package_name
    yield etc_dir
5159
5160
def time_seconds(**kwargs):
    """
    Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the offset that `kwargs` describe (passed to `datetime.timedelta`)
    """
    now = time.time()
    offset = datetime.timedelta(**kwargs)
    return now + offset.total_seconds()
5166
5167
# Create a JSON Web Signature (JWS) with the HS256 algorithm.
# The result is laid out as a JWS Compact Serialization: header.payload.signature
# JWT: https://www.rfc-editor.org/rfc/rfc7519.html
# JWS: https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Serialize and sign `payload_data` with HMAC-SHA256.

    @param payload_data  JSON-serializable payload
    @param key           Signing key (str)
    @param headers       Extra header fields; they override the default `alg`/`typ`
    @returns             The token as bytes

    NOTE: the three parts are encoded with `base64.b64encode` (standard alphabet, padded)
    """
    def _encode_part(obj):
        return base64.b64encode(json.dumps(obj).encode())

    header_data = dict(alg='HS256', typ='JWT')
    header_data.update(headers or {})

    signing_input = b'.'.join((_encode_part(header_data), _encode_part(payload_data)))
    digest = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return b'.'.join((signing_input, base64.b64encode(digest)))
5185
5186
# Can be extended in future to verify the signature, parse the header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of a compact-serialized JWT (str).

    The header and signature are neither inspected nor verified.
    Raises ValueError unless `jwt` consists of exactly three dot-separated parts.
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # Restore padding that may have been stripped; superfluous ='s are ignored
    padded_payload = f'{payload_b64}==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
5193
5194
# False on Windows ('nt') until `windows_enable_vt_mode` sets it to True; None on other platforms
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5196
5197
@functools.cache
def supports_terminal_sequences(stream):
    """Whether terminal escape sequences can be written to `stream` (cached per stream).

    Requires `WINDOWS_VT_MODE` to be truthy on Windows and a non-empty `TERM`
    environment variable elsewhere; the stream must also report being a tty.
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False

    try:
        return stream.isatty()
    except BaseException:
        # Any failure to query the stream counts as "not supported"
        return False
5209
5210
def windows_enable_vt_mode():
    """Enable virtual terminal processing for the Windows console output.

    Does nothing when `get_windows_version()` reports a version below 10.0.10586.
    On success, the module-level `WINDOWS_VT_MODE` is set to True and the cache of
    `supports_terminal_sequences` is cleared so that it gets re-evaluated.

    Raises Exception when GetConsoleMode or SetConsoleMode reports failure.

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Windows-only modules; imported lazily so the module can be loaded elsewhere
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        # Convert the C runtime file descriptor into the OS handle the console API expects
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Keep the existing mode bits and add the virtual terminal flag
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        # The descriptor is closed whether or not the mode change succeeded
        os.close(handle)

    # Only reached when both console calls succeeded
    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    supports_terminal_sequences.cache_clear()
5241
5242
# ESC, `[`, one or more characters other than `m`, then the terminating `m`
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with every sequence matched by `_terminal_sequences_re` removed."""
    return re.sub(_terminal_sequences_re, '', string)
5248
5249
def number_of_digits(number):
    """Return the length of `number` rendered with `%d` (a minus sign is counted too)."""
    rendered = '%d' % number
    return len(rendered)
5252
5253
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy `values` with `delim`.

    When `from_dict` is given, each value is treated as a path that is
    looked up in it with `traversal.traverse_obj`, and the results are joined instead.
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(key)) for key in values)
    return delim.join(str(item) for item in values if item)
5258
5259
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    `thumbnails` is returned as-is when no format has a known width.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    # Tuples compare element-wise, so the widest format wins and height breaks ties
    candidates = (tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats)
    max_width, max_height = max(candidates, default=(0, 0))
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled.append(merge_dicts(
            {'url': re.sub(url_width_re, str(max_width), thumbnail['url'])},
            {'width': max_width, 'height': max_height},
            thumbnail))
    return scaled
5280
5281
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple.

    @returns (start, end, total); all three are None when `range` is empty or has no byte range
    """
    unknown = (None, None, None)
    if not range:
        return unknown

    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if mobj is None:
        return unknown

    start, end, total = mobj.groups()
    return int(start), int_or_none(end), int_or_none(total)
5290
5291
def read_stdin(what):
    """Announce via `write_string` that `what` is being read from STDIN, then return `sys.stdin`."""
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    else:
        eof = 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5296
5297
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data     The leading bytes of the file
    @returns (encoding, bytes to skip)
             encoding is None when neither a BOM nor a coding declaration is found
    """

    # BOM marks are given priority over declarations
    bom_match = next(((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_match is not None:
        return bom_match

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if declaration is None:
        return None, 0
    return declaration.group(1).decode(), 0
5314
5315
class Config:
    """Arguments from one source, together with the configs they reference.

    Each instance keeps its own arguments in `own_args` and one child `Config`
    in `configs` for every location listed in the parsed `config_locations` option.
    `_loaded_paths` is shared with the children so that no location is loaded twice.
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    # Marks that `read_file` was called without an explicit `default`
    _NO_FILE_DEFAULT = object()

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Set the arguments of this config and load the configs they reference.

        May only be called once per instance.
        @returns    False if `filename` was already loaded, True otherwise
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse `own_args` and append a child config for each config location.

        Relative locations are resolved against the directory of `filename`;
        a directory location refers to the `yt-dlp.conf` inside it; `-` reads from STDIN.
        @returns    False if `filename` was already loaded, True otherwise
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=_NO_FILE_DEFAULT):
        """Read `filename` and split its contents into a list of arguments.

        @param default  Returned as-is when the file cannot be opened.
                        When not given, a new empty list is returned
        @raises ValueError  when the contents cannot be decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            # silently skip if file is not present
            # A fresh list per call, so that callers never share (and mutate) one default object
            return [] if default is Config._NO_FILE_DEFAULT else default

        # `with` closes the file on every exit path
        with optionf:
            try:
                enc, skip = determine_file_encoding(optionf.read(512))
                optionf.seek(skip, io.SEEK_SET)
            except OSError:
                enc = None  # silently skip read errors
            try:
                # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                contents = optionf.read().decode(enc or preferredencoding())
                return shlex.split(contents, comments=True)
            except Exception as err:
                raise ValueError(f'Unable to parse "{filename}": {err}') from err

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with the values of credential options replaced by `PRIVATE`.

        Handles both the `--option=value` and the `--option value` forms.
        """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            # The argument following a bare private option is its value
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config from `args` (passed to `init`); keep it unless it was already loaded."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """The arguments of all child configs (last appended first), followed by the own ones."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
5423
5424
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes

    The connection runs on a private event loop. It is closed by `__exit__`,
    which is also registered with `atexit`.
    """
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        # Also invoked through `atexit`, possibly after an explicit exit; nothing is left to do then
        if self.loop.is_closed():
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Cancelling needs a usable loop, so it must happen before the loop is closed
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` on `loop` until it completes and return its result.

        @raises ValueError  when `main` is not a coroutine
        """
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel every task of `loop`, wait for them and report their unhandled exceptions."""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # `gather` takes the loop from the tasks themselves;
        # its "loop" parameter is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5494
5495
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            # Title-casing the name makes differently-cased duplicates collapse into one key
            merged[name.title()] = value
    return merged
5499
5500
def cached_method(f):
    """Cache a method

    Results are stored per instance, in its `__dict__`, keyed by the method name
    and by the bound arguments (defaults applied, `self` excluded).
    """
    sig = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        call = sig.bind(self, *args, **kwargs)
        call.apply_defaults()
        # The first bound argument is `self`, which is not part of the key
        key = tuple(call.arguments.values())[1:]

        per_instance = vars(self).setdefault('_cached_method__cache', {})
        results = per_instance.setdefault(f.__name__, {})
        if key in results:
            return results[key]
        results[key] = f(self, *args, **kwargs)
        return results[key]
    return wrapper
5516
5517
class classproperty:
    """property access for class methods with optional caching"""
    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Used with arguments only, e.g. `@classproperty(cache=True)`:
        # construction is deferred until the function is supplied
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        # Per-class cache of computed values; None disables caching
        self._cache = None
        if cache:
            self._cache = {}

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls not in cache:
            cache[cls] = self.func(cls)
        return cache[cls]
5536
5537
class function_with_repr:
    """Callable wrapper around `func` whose `repr()` can be customized

    Without `repr_` (or when it is falsy), the repr is the module and qualified name of `func`.
    """
    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        custom = self.__repr
        if not custom:
            return f'{self.func.__module__}.{self.func.__qualname__}'
        return custom
5550
5551
class Namespace(types.SimpleNamespace):
    """Namespace that iterates over its attribute values"""
    # NOTE(review): originally described as "Immutable namespace",
    # but nothing in this class prevents attribute assignment

    def __iter__(self):
        values = vars(self).values()
        return iter(values)

    @property
    def items_(self):
        """The (name, value) pairs of all attributes"""
        return vars(self).items()
5561
5562
# Known file extensions, grouped by media type
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# `video` and `audio` also include the respective `common_*` extensions, appended at the end
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in that order
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5577
5578
5579 class RetryManager:
5580     """Usage:
5581         for retry in RetryManager(...):
5582             try:
5583                 ...
5584             except SomeException as err:
5585                 retry.error = err
5586                 continue
5587     """
5588     attempt, _error = 0, None
5589
5590     def __init__(self, _retries, _error_callback, **kwargs):
5591         self.retries = _retries or 0
5592         self.error_callback = functools.partial(_error_callback, **kwargs)
5593
5594     def _should_retry(self):
5595         return self._error is not NO_DEFAULT and self.attempt <= self.retries
5596
5597     @property
5598     def error(self):
5599         if self._error is NO_DEFAULT:
5600             return None
5601         return self._error
5602
5603     @error.setter
5604     def error(self, value):
5605         self._error = value
5606
5607     def __iter__(self):
5608         while self._should_retry():
5609             self.error = NO_DEFAULT
5610             self.attempt += 1
5611             yield self
5612             if self.error:
5613                 self.error_callback(self.error, self.attempt, self.retries)
5614
5615     @staticmethod
5616     def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5617         """Utility function for reporting retries"""
5618         if count > retries:
5619             if error:
5620                 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5621             raise e
5622
5623         if not count:
5624             return warn(e)
5625         elif isinstance(e, ExtractorError):
5626             e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5627         warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5628
5629         delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5630         if delay:
5631             info(f'Sleeping {delay:.2f} seconds ...')
5632             time.sleep(delay)
5633
5634
def make_archive_id(ie, video_id):
    """Return the archive id `<lowercased extractor key> <video_id>`.

    @param ie   Either the extractor key (str) or an object providing `ie_key()`
    """
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
5638
5639
def truncate_string(s, left, right=0):
    """Shorten `s` to `left + right` characters by replacing its middle with `...`.

    The ellipsis counts towards `left`. `s` is returned unchanged when it is None
    or already short enough.
    """
    assert left > 3 and right >= 0
    if s is None:
        return s
    if len(s) <= left + right:
        return s
    tail = s[-right:] if right else ""
    return f'{s[:left-3]}...{tail}'
5645
5646
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Resolve `options` into an ordered set of the items listed in `alias_dict['all']`.

    @param options      Item names or aliases; a leading `-` discards instead of adding
    @param alias_dict   Maps aliases to lists of options. Must contain `all`
    @param use_regex    Treat non-alias options as case-insensitive regexes that are
                        matched in full against the items of `alias_dict['all']`
    @param start        Items that are requested initially
    @raises ValueError  for an option that is neither an alias nor a known item
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for val in options:
        discard = val.startswith('-')
        if discard:
            val = val[1:]

        if val in alias_dict:
            expansion = alias_dict[val]
            if discard:
                # Discarding an alias inverts the sign of each of its members
                expansion = [
                    entry[1:] if entry.startswith('-') else f'-{entry}' for entry in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(val, re.I).fullmatch, alias_dict['all'])
        elif val in alias_dict['all']:
            current = [val]
        else:
            raise ValueError(val)

        if not discard:
            requested.extend(current)
            continue
        for item in current:
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5675
5676
5677 # TODO: Rewrite
5678 class FormatSorter:
5679     regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5680
5681     default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5682                'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5683                'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
5684     ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5685                     'height', 'width', 'proto', 'vext', 'abr', 'aext',
5686                     'fps', 'fs_approx', 'source', 'id')
5687
5688     settings = {
5689         'vcodec': {'type': 'ordered', 'regex': True,
5690                    'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5691         'acodec': {'type': 'ordered', 'regex': True,
5692                    'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
5693         'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5694                 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5695         'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5696                   'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5697         'vext': {'type': 'ordered', 'field': 'video_ext',
5698                  'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5699                  'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
5700         'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5701                  'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5702                  'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
5703         'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5704         'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5705                        'field': ('vcodec', 'acodec'),
5706                        'function': lambda it: int(any(v != 'none' for v in it))},
5707         'ie_pref': {'priority': True, 'type': 'extractor'},
5708         'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5709         'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5710         'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5711         'quality': {'convert': 'float', 'default': -1},
5712         'filesize': {'convert': 'bytes'},
5713         'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5714         'id': {'convert': 'string', 'field': 'format_id'},
5715         'height': {'convert': 'float_none'},
5716         'width': {'convert': 'float_none'},
5717         'fps': {'convert': 'float_none'},
5718         'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5719         'tbr': {'convert': 'float_none'},
5720         'vbr': {'convert': 'float_none'},
5721         'abr': {'convert': 'float_none'},
5722         'asr': {'convert': 'float_none'},
5723         'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5724
5725         'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5726         'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'),
5727                'function': lambda it: next(filter(None, it), None)},
5728         'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'),
5729                  'function': lambda it: next(filter(None, it), None)},
5730         'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5731         'res': {'type': 'multiple', 'field': ('height', 'width'),
5732                 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5733
5734         # Actual field names
5735         'format_id': {'type': 'alias', 'field': 'id'},
5736         'preference': {'type': 'alias', 'field': 'ie_pref'},
5737         'language_preference': {'type': 'alias', 'field': 'lang'},
5738         'source_preference': {'type': 'alias', 'field': 'source'},
5739         'protocol': {'type': 'alias', 'field': 'proto'},
5740         'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5741         'audio_channels': {'type': 'alias', 'field': 'channels'},
5742
5743         # Deprecated
5744         'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5745         'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5746         'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5747         'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5748         'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5749         'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5750         'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5751         'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5752         'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5753         'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5754         'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5755         'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5756         'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5757         'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5758         'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5759         'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5760         'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5761         'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5762         'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5763         'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5764     }
5765
5766     def __init__(self, ydl, field_preference):
5767         self.ydl = ydl
5768         self._order = []
5769         self.evaluate_params(self.ydl.params, field_preference)
5770         if ydl.params.get('verbose'):
5771             self.print_verbose_info(self.ydl.write_debug)
5772
5773     def _get_field_setting(self, field, key):
5774         if field not in self.settings:
5775             if key in ('forced', 'priority'):
5776                 return False
5777             self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5778                                         'deprecated and may be removed in a future version')
5779             self.settings[field] = {}
5780         propObj = self.settings[field]
5781         if key not in propObj:
5782             type = propObj.get('type')
5783             if key == 'field':
5784                 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5785             elif key == 'convert':
5786                 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5787             else:
5788                 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5789             propObj[key] = default
5790         return propObj[key]
5791
5792     def _resolve_field_value(self, field, value, convertNone=False):
5793         if value is None:
5794             if not convertNone:
5795                 return None
5796         else:
5797             value = value.lower()
5798         conversion = self._get_field_setting(field, 'convert')
5799         if conversion == 'ignore':
5800             return None
5801         if conversion == 'string':
5802             return value
5803         elif conversion == 'float_none':
5804             return float_or_none(value)
5805         elif conversion == 'bytes':
5806             return parse_bytes(value)
5807         elif conversion == 'order':
5808             order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5809             use_regex = self._get_field_setting(field, 'regex')
5810             list_length = len(order_list)
5811             empty_pos = order_list.index('') if '' in order_list else list_length + 1
5812             if use_regex and value is not None:
5813                 for i, regex in enumerate(order_list):
5814                     if regex and re.match(regex, value):
5815                         return list_length - i
5816                 return list_length - empty_pos  # not in list
5817             else:  # not regex or  value = None
5818                 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5819         else:
5820             if value.isnumeric():
5821                 return float(value)
5822             else:
5823                 self.settings[field]['convert'] = 'string'
5824                 return value
5825
5826     def evaluate_params(self, params, sort_extractor):
5827         self._use_free_order = params.get('prefer_free_formats', False)
5828         self._sort_user = params.get('format_sort', [])
5829         self._sort_extractor = sort_extractor
5830
5831         def add_item(field, reverse, closest, limit_text):
5832             field = field.lower()
5833             if field in self._order:
5834                 return
5835             self._order.append(field)
5836             limit = self._resolve_field_value(field, limit_text)
5837             data = {
5838                 'reverse': reverse,
5839                 'closest': False if limit is None else closest,
5840                 'limit_text': limit_text,
5841                 'limit': limit}
5842             if field in self.settings:
5843                 self.settings[field].update(data)
5844             else:
5845                 self.settings[field] = data
5846
5847         sort_list = (
5848             tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5849             + (tuple() if params.get('format_sort_force', False)
5850                 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5851             + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5852
5853         for item in sort_list:
5854             match = re.match(self.regex, item)
5855             if match is None:
5856                 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5857             field = match.group('field')
5858             if field is None:
5859                 continue
5860             if self._get_field_setting(field, 'type') == 'alias':
5861                 alias, field = field, self._get_field_setting(field, 'field')
5862                 if self._get_field_setting(alias, 'deprecated'):
5863                     self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5864                                                 f'be removed in a future version. Please use {field} instead')
5865             reverse = match.group('reverse') is not None
5866             closest = match.group('separator') == '~'
5867             limit_text = match.group('limit')
5868
5869             has_limit = limit_text is not None
5870             has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5871             has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5872
5873             fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5874             limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5875             limit_count = len(limits)
5876             for (i, f) in enumerate(fields):
5877                 add_item(f, reverse, closest,
5878                          limits[i] if i < limit_count
5879                          else limits[0] if has_limit and not has_multiple_limits
5880                          else None)
5881
5882     def print_verbose_info(self, write_debug):
5883         if self._sort_user:
5884             write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5885         if self._sort_extractor:
5886             write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5887         write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5888             '+' if self._get_field_setting(field, 'reverse') else '', field,
5889             '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5890                           self._get_field_setting(field, 'limit_text'),
5891                           self._get_field_setting(field, 'limit'))
5892             if self._get_field_setting(field, 'limit_text') is not None else '')
5893             for field in self._order if self._get_field_setting(field, 'visible')]))
5894
5895     def _calculate_field_preference_from_value(self, format, field, type, value):
5896         reverse = self._get_field_setting(field, 'reverse')
5897         closest = self._get_field_setting(field, 'closest')
5898         limit = self._get_field_setting(field, 'limit')
5899
5900         if type == 'extractor':
5901             maximum = self._get_field_setting(field, 'max')
5902             if value is None or (maximum is not None and value >= maximum):
5903                 value = -1
5904         elif type == 'boolean':
5905             in_list = self._get_field_setting(field, 'in_list')
5906             not_in_list = self._get_field_setting(field, 'not_in_list')
5907             value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5908         elif type == 'ordered':
5909             value = self._resolve_field_value(field, value, True)
5910
5911         # try to convert to number
5912         val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5913         is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5914         if is_num:
5915             value = val_num
5916
5917         return ((-10, 0) if value is None
5918                 else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
5919                 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5920                 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5921                 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5922                 else (-1, value, 0))
5923
5924     def _calculate_field_preference(self, format, field):
5925         type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
5926         get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5927         if type == 'multiple':
5928             type = 'field'  # Only 'field' is allowed in multiple for now
5929             actual_fields = self._get_field_setting(field, 'field')
5930
5931             value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5932         else:
5933             value = get_value(field)
5934         return self._calculate_field_preference_from_value(format, field, type, value)
5935
5936     def calculate_preference(self, format):
5937         # Determine missing protocol
5938         if not format.get('protocol'):
5939             format['protocol'] = determine_protocol(format)
5940
5941         # Determine missing ext
5942         if not format.get('ext') and 'url' in format:
5943             format['ext'] = determine_ext(format['url'])
5944         if format.get('vcodec') == 'none':
5945             format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5946             format['video_ext'] = 'none'
5947         else:
5948             format['video_ext'] = format['ext']
5949             format['audio_ext'] = 'none'
5950         # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
5951         #    format['preference'] = -1000
5952
5953         if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5954             # HEVC-over-FLV is out-of-spec by FLV's original spec
5955             # ref. https://trac.ffmpeg.org/ticket/6389
5956             # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5957             format['preference'] = -100
5958
5959         # Determine missing bitrates
5960         if format.get('vcodec') == 'none':
5961             format['vbr'] = 0
5962         if format.get('acodec') == 'none':
5963             format['abr'] = 0
5964         if not format.get('vbr') and format.get('vcodec') != 'none':
5965             format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
5966         if not format.get('abr') and format.get('acodec') != 'none':
5967             format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
5968         if not format.get('tbr'):
5969             format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
5970
5971         return tuple(self._calculate_field_preference(format, field) for field in self._order)