yt_dlp/utils/_utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import collections.abc
   9 import contextlib
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import gzip
  15 import hashlib
  16 import hmac
  17 import html.entities
  18 import html.parser
  19 import http.client
  20 import http.cookiejar
  21 import inspect
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import mimetypes
  28 import netrc
  29 import operator
  30 import os
  31 import platform
  32 import random
  33 import re
  34 import shlex
  35 import socket
  36 import ssl
  37 import struct
  38 import subprocess
  39 import sys
  40 import tempfile
  41 import time
  42 import traceback
  43 import types
  44 import unicodedata
  45 import urllib.error
  46 import urllib.parse
  47 import urllib.request
  48 import xml.etree.ElementTree
  49 import zlib
  50
  51 from . import traversal
  52
  53 from ..compat import functools  # isort: split
  54 from ..compat import (
  55     compat_etree_fromstring,
  56     compat_expanduser,
  57     compat_HTMLParseError,
  58     compat_os_name,
  59     compat_shlex_quote,
  60 )
  61 from ..dependencies import brotli, certifi, websockets, xattr
  62 from ..socks import ProxyType, sockssocket
  63
  64 __name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module
  65
  66 # This is not clearly defined otherwise
  67 compiled_regex_type = type(re.compile(''))
  68
  69
  70 def random_user_agent():
  71     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  72     _CHROME_VERSIONS = (
  73         '90.0.4430.212',
  74         '90.0.4430.24',
  75         '90.0.4430.70',
  76         '90.0.4430.72',
  77         '90.0.4430.85',
  78         '90.0.4430.93',
  79         '91.0.4472.101',
  80         '91.0.4472.106',
  81         '91.0.4472.114',
  82         '91.0.4472.124',
  83         '91.0.4472.164',
  84         '91.0.4472.19',
  85         '91.0.4472.77',
  86         '92.0.4515.107',
  87         '92.0.4515.115',
  88         '92.0.4515.131',
  89         '92.0.4515.159',
  90         '92.0.4515.43',
  91         '93.0.4556.0',
  92         '93.0.4577.15',
  93         '93.0.4577.63',
  94         '93.0.4577.82',
  95         '94.0.4606.41',
  96         '94.0.4606.54',
  97         '94.0.4606.61',
  98         '94.0.4606.71',
  99         '94.0.4606.81',
 100         '94.0.4606.85',
 101         '95.0.4638.17',
 102         '95.0.4638.50',
 103         '95.0.4638.54',
 104         '95.0.4638.69',
 105         '95.0.4638.74',
 106         '96.0.4664.18',
 107         '96.0.4664.45',
 108         '96.0.4664.55',
 109         '96.0.4664.93',
 110         '97.0.4692.20',
 111     )
 112     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 113
 114
 115 SUPPORTED_ENCODINGS = [
 116     'gzip', 'deflate'
 117 ]
 118 if brotli:
 119     SUPPORTED_ENCODINGS.append('br')
 120
 121 std_headers = {
 122     'User-Agent': random_user_agent(),
 123     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 124     'Accept-Language': 'en-us,en;q=0.5',
 125     'Sec-Fetch-Mode': 'navigate',
 126 }
 127
 128
 129 USER_AGENTS = {
 130     'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
 131 }
 132
 133
 134 class NO_DEFAULT:
 135     pass
 136
 137
 138 def IDENTITY(x):
 139     return x
 140
 141
 142 ENGLISH_MONTH_NAMES = [
 143     'January', 'February', 'March', 'April', 'May', 'June',
 144     'July', 'August', 'September', 'October', 'November', 'December']
 145
 146 MONTH_NAMES = {
 147     'en': ENGLISH_MONTH_NAMES,
 148     'fr': [
 149         'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
 150         'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
 151     # these follow the genitive grammatical case (dopełniacz)
 152     # some websites might be using nominative, which will require another month list
 153     # https://en.wikibooks.org/wiki/Polish/Noun_cases
 154     'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
 155            'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
 156 }
 157
 158 # From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
 159 TIMEZONE_NAMES = {
 160     'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
 161     'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
 162     'EST': -5, 'EDT': -4,  # Eastern
 163     'CST': -6, 'CDT': -5,  # Central
 164     'MST': -7, 'MDT': -6,  # Mountain
 165     'PST': -8, 'PDT': -7   # Pacific
 166 }
 167
 168 # needed for sanitizing filenames in restricted mode
 169 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
 170                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
 171                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 172
 173 DATE_FORMATS = (
 174     '%d %B %Y',
 175     '%d %b %Y',
 176     '%B %d %Y',
 177     '%B %dst %Y',
 178     '%B %dnd %Y',
 179     '%B %drd %Y',
 180     '%B %dth %Y',
 181     '%b %d %Y',
 182     '%b %dst %Y',
 183     '%b %dnd %Y',
 184     '%b %drd %Y',
 185     '%b %dth %Y',
 186     '%b %dst %Y %I:%M',
 187     '%b %dnd %Y %I:%M',
 188     '%b %drd %Y %I:%M',
 189     '%b %dth %Y %I:%M',
 190     '%Y %m %d',
 191     '%Y-%m-%d',
 192     '%Y.%m.%d.',
 193     '%Y/%m/%d',
 194     '%Y/%m/%d %H:%M',
 195     '%Y/%m/%d %H:%M:%S',
 196     '%Y%m%d%H%M',
 197     '%Y%m%d%H%M%S',
 198     '%Y%m%d',
 199     '%Y-%m-%d %H:%M',
 200     '%Y-%m-%d %H:%M:%S',
 201     '%Y-%m-%d %H:%M:%S.%f',
 202     '%Y-%m-%d %H:%M:%S:%f',
 203     '%d.%m.%Y %H:%M',
 204     '%d.%m.%Y %H.%M',
 205     '%Y-%m-%dT%H:%M:%SZ',
 206     '%Y-%m-%dT%H:%M:%S.%fZ',
 207     '%Y-%m-%dT%H:%M:%S.%f0Z',
 208     '%Y-%m-%dT%H:%M:%S',
 209     '%Y-%m-%dT%H:%M:%S.%f',
 210     '%Y-%m-%dT%H:%M',
 211     '%b %d %Y at %H:%M',
 212     '%b %d %Y at %H:%M:%S',
 213     '%B %d %Y at %H:%M',
 214     '%B %d %Y at %H:%M:%S',
 215     '%H:%M %d-%b-%Y',
 216 )
 217
 218 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 219 DATE_FORMATS_DAY_FIRST.extend([
 220     '%d-%m-%Y',
 221     '%d.%m.%Y',
 222     '%d.%m.%y',
 223     '%d/%m/%Y',
 224     '%d/%m/%y',
 225     '%d/%m/%Y %H:%M:%S',
 226     '%d-%m-%Y %H:%M',
 227     '%H:%M %d/%m/%Y',
 228 ])
 229
 230 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 231 DATE_FORMATS_MONTH_FIRST.extend([
 232     '%m-%d-%Y',
 233     '%m.%d.%Y',
 234     '%m/%d/%Y',
 235     '%m/%d/%y',
 236     '%m/%d/%Y %H:%M:%S',
 237 ])
 238
 239 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
 240 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
 241
 242 NUMBER_RE = r'\d+(?:\.\d+)?'
 243
 244
 245 @functools.cache
 246 def preferredencoding():
 247     """Get preferred encoding.
 248
 249     Returns the best encoding scheme for the system, based on
 250     locale.getpreferredencoding() and some further tweaks.
 251     """
 252     try:
 253         pref = locale.getpreferredencoding()
 254         'TEST'.encode(pref)
 255     except Exception:
 256         pref = 'UTF-8'
 257
 258     return pref
 259
 260
 261 def write_json_file(obj, fn):
 262     """ Encode obj as JSON and write it to fn, atomically if possible """
 263
 264     tf = tempfile.NamedTemporaryFile(
 265         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 266         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 267
 268     try:
 269         with tf:
 270             json.dump(obj, tf, ensure_ascii=False)
 271         if sys.platform == 'win32':
 272             # Need to remove existing file on Windows, else os.rename raises
 273             # WindowsError or FileExistsError.
 274             with contextlib.suppress(OSError):
 275                 os.unlink(fn)
 276         with contextlib.suppress(OSError):
 277             mask = os.umask(0)
 278             os.umask(mask)
 279             os.chmod(tf.name, 0o666 & ~mask)
 280         os.rename(tf.name, fn)
 281     except Exception:
 282         with contextlib.suppress(OSError):
 283             os.remove(tf.name)
 284         raise
 285
 286
 287 def find_xpath_attr(node, xpath, key, val=None):
 288     """ Find the xpath xpath[@key=val] """
 289     assert re.match(r'^[a-zA-Z_-]+$', key)
 290     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 291     return node.find(expr)
 292
 293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 294 # the namespace parameter
 295
 296
 297 def xpath_with_ns(path, ns_map):
 298     components = [c.split(':') for c in path.split('/')]
 299     replaced = []
 300     for c in components:
 301         if len(c) == 1:
 302             replaced.append(c[0])
 303         else:
 304             ns, tag = c
 305             replaced.append('{%s}%s' % (ns_map[ns], tag))
 306     return '/'.join(replaced)
 307
 308
 309 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 310     def _find_xpath(xpath):
 311         return node.find(xpath)
 312
 313     if isinstance(xpath, str):
 314         n = _find_xpath(xpath)
 315     else:
 316         for xp in xpath:
 317             n = _find_xpath(xp)
 318             if n is not None:
 319                 break
 320
 321     if n is None:
 322         if default is not NO_DEFAULT:
 323             return default
 324         elif fatal:
 325             name = xpath if name is None else name
 326             raise ExtractorError('Could not find XML element %s' % name)
 327         else:
 328             return None
 329     return n
 330
 331
 332 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 333     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 334     if n is None or n == default:
 335         return n
 336     if n.text is None:
 337         if default is not NO_DEFAULT:
 338             return default
 339         elif fatal:
 340             name = xpath if name is None else name
 341             raise ExtractorError('Could not find XML element\'s text %s' % name)
 342         else:
 343             return None
 344     return n.text
 345
 346
 347 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 348     n = find_xpath_attr(node, xpath, key)
 349     if n is None:
 350         if default is not NO_DEFAULT:
 351             return default
 352         elif fatal:
 353             name = f'{xpath}[@{key}]' if name is None else name
 354             raise ExtractorError('Could not find XML attribute %s' % name)
 355         else:
 356             return None
 357     return n.attrib[key]
 358
 359
 360 def get_element_by_id(id, html, **kwargs):
 361     """Return the content of the tag with the specified ID in the passed HTML document"""
 362     return get_element_by_attribute('id', id, html, **kwargs)
 363
 364
 365 def get_element_html_by_id(id, html, **kwargs):
 366     """Return the html of the tag with the specified ID in the passed HTML document"""
 367     return get_element_html_by_attribute('id', id, html, **kwargs)
 368
 369
 370 def get_element_by_class(class_name, html):
 371     """Return the content of the first tag with the specified class in the passed HTML document"""
 372     retval = get_elements_by_class(class_name, html)
 373     return retval[0] if retval else None
 374
 375
 376 def get_element_html_by_class(class_name, html):
 377     """Return the html of the first tag with the specified class in the passed HTML document"""
 378     retval = get_elements_html_by_class(class_name, html)
 379     return retval[0] if retval else None
 380
 381
 382 def get_element_by_attribute(attribute, value, html, **kwargs):
 383     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 384     return retval[0] if retval else None
 385
 386
 387 def get_element_html_by_attribute(attribute, value, html, **kargs):
 388     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 389     return retval[0] if retval else None
 390
 391
 392 def get_elements_by_class(class_name, html, **kargs):
 393     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 394     return get_elements_by_attribute(
 395         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 396         html, escape_value=False)
 397
 398
 399 def get_elements_html_by_class(class_name, html):
 400     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 401     return get_elements_html_by_attribute(
 402         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 403         html, escape_value=False)
 404
 405
 406 def get_elements_by_attribute(*args, **kwargs):
 407     """Return the content of the tag with the specified attribute in the passed HTML document"""
 408     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 409
 410
 411 def get_elements_html_by_attribute(*args, **kwargs):
 412     """Return the html of the tag with the specified attribute in the passed HTML document"""
 413     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 414
 415
 416 def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
 417     """
 418     Return the text (content) and the html (whole) of the tag with the specified
 419     attribute in the passed HTML document
 420     """
 421     if not value:
 422         return
 423
 424     quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 425
 426     value = re.escape(value) if escape_value else value
 427
 428     partial_element_re = rf'''(?x)
 429         <(?P<tag>{tag})
 430          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 431          \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
 432         '''
 433
 434     for m in re.finditer(partial_element_re, html):
 435         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 436
 437         yield (
 438             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 439             whole
 440         )
 441
 442
 443 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 444     """
 445     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 446     closing tag for the first opening tag it has encountered, and can be used
 447     as a context manager
 448     """
 449
 450     class HTMLBreakOnClosingTagException(Exception):
 451         pass
 452
 453     def __init__(self):
 454         self.tagstack = collections.deque()
 455         html.parser.HTMLParser.__init__(self)
 456
 457     def __enter__(self):
 458         return self
 459
 460     def __exit__(self, *_):
 461         self.close()
 462
 463     def close(self):
 464         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 465         # so data remains buffered; we no longer have any interest in it, thus
 466         # override this method to discard it
 467         pass
 468
 469     def handle_starttag(self, tag, _):
 470         self.tagstack.append(tag)
 471
 472     def handle_endtag(self, tag):
 473         if not self.tagstack:
 474             raise compat_HTMLParseError('no tags in the stack')
 475         while self.tagstack:
 476             inner_tag = self.tagstack.pop()
 477             if inner_tag == tag:
 478                 break
 479         else:
 480             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 481         if not self.tagstack:
 482             raise self.HTMLBreakOnClosingTagException()
 483
 484
 485 # XXX: This should be far less strict
 486 def get_element_text_and_html_by_tag(tag, html):
 487     """
 488     For the first element with the specified tag in the passed HTML document
 489     return its' content (text) and the whole element (html)
 490     """
 491     def find_or_raise(haystack, needle, exc):
 492         try:
 493             return haystack.index(needle)
 494         except ValueError:
 495             raise exc
 496     closing_tag = f'</{tag}>'
 497     whole_start = find_or_raise(
 498         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 499     content_start = find_or_raise(
 500         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 501     content_start += whole_start + 1
 502     with HTMLBreakOnClosingTagParser() as parser:
 503         parser.feed(html[whole_start:content_start])
 504         if not parser.tagstack or parser.tagstack[0] != tag:
 505             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 506         offset = content_start
 507         while offset < len(html):
 508             next_closing_tag_start = find_or_raise(
 509                 html[offset:], closing_tag,
 510                 compat_HTMLParseError(f'closing {tag} tag not found'))
 511             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 512             try:
 513                 parser.feed(html[offset:offset + next_closing_tag_end])
 514                 offset += next_closing_tag_end
 515             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 516                 return html[content_start:offset + next_closing_tag_start], \
 517                     html[whole_start:offset + next_closing_tag_end]
 518         raise compat_HTMLParseError('unexpected end of html')
 519
 520
 521 class HTMLAttributeParser(html.parser.HTMLParser):
 522     """Trivial HTML parser to gather the attributes for a single element"""
 523
 524     def __init__(self):
 525         self.attrs = {}
 526         html.parser.HTMLParser.__init__(self)
 527
 528     def handle_starttag(self, tag, attrs):
 529         self.attrs = dict(attrs)
 530         raise compat_HTMLParseError('done')
 531
 532
 533 class HTMLListAttrsParser(html.parser.HTMLParser):
 534     """HTML parser to gather the attributes for the elements of a list"""
 535
 536     def __init__(self):
 537         html.parser.HTMLParser.__init__(self)
 538         self.items = []
 539         self._level = 0
 540
 541     def handle_starttag(self, tag, attrs):
 542         if tag == 'li' and self._level == 0:
 543             self.items.append(dict(attrs))
 544         self._level += 1
 545
 546     def handle_endtag(self, tag):
 547         self._level -= 1
 548
 549
 550 def extract_attributes(html_element):
 551     """Given a string for an HTML element such as
 552     <el
 553          a="foo" B="bar" c="&98;az" d=boz
 554          empty= noval entity="&amp;"
 555          sq='"' dq="'"
 556     >
 557     Decode and return a dictionary of attributes.
 558     {
 559         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 560         'empty': '', 'noval': None, 'entity': '&',
 561         'sq': '"', 'dq': '\''
 562     }.
 563     """
 564     parser = HTMLAttributeParser()
 565     with contextlib.suppress(compat_HTMLParseError):
 566         parser.feed(html_element)
 567         parser.close()
 568     return parser.attrs
 569
 570
 571 def parse_list(webpage):
 572     """Given a string for an series of HTML <li> elements,
 573     return a dictionary of their attributes"""
 574     parser = HTMLListAttrsParser()
 575     parser.feed(webpage)
 576     parser.close()
 577     return parser.items
 578
 579
 580 def clean_html(html):
 581     """Clean an HTML snippet into a readable string"""
 582
 583     if html is None:  # Convenience for sanitizing descriptions etc.
 584         return html
 585
 586     html = re.sub(r'\s+', ' ', html)
 587     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 588     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 589     # Strip html tags
 590     html = re.sub('<.*?>', '', html)
 591     # Replace html entities
 592     html = unescapeHTML(html)
 593     return html.strip()
 594
 595
 596 class LenientJSONDecoder(json.JSONDecoder):
 597     # TODO: Write tests
 598     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 599         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 600         self._close_attempts = 2 * close_objects
 601         super().__init__(*args, **kwargs)
 602
 603     @staticmethod
 604     def _close_object(err):
 605         doc = err.doc[:err.pos]
 606         # We need to add comma first to get the correct error message
 607         if err.msg.startswith('Expecting \',\''):
 608             return doc + ','
 609         elif not doc.endswith(','):
 610             return
 611
 612         if err.msg.startswith('Expecting property name'):
 613             return doc[:-1] + '}'
 614         elif err.msg.startswith('Expecting value'):
 615             return doc[:-1] + ']'
 616
 617     def decode(self, s):
 618         if self.transform_source:
 619             s = self.transform_source(s)
 620         for attempt in range(self._close_attempts + 1):
 621             try:
 622                 if self.ignore_extra:
 623                     return self.raw_decode(s.lstrip())[0]
 624                 return super().decode(s)
 625             except json.JSONDecodeError as e:
 626                 if e.pos is None:
 627                     raise
 628                 elif attempt < self._close_attempts:
 629                     s = self._close_object(e)
 630                     if s is not None:
 631                         continue
 632                 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
 633         assert False, 'Too many attempts to decode JSON'
 634
 635
 636 def sanitize_open(filename, open_mode):
 637     """Try to open the given filename, and slightly tweak it if this fails.
 638
 639     Attempts to open the given filename. If this fails, it tries to change
 640     the filename slightly, step by step, until it's either able to open it
 641     or it fails and raises a final exception, like the standard open()
 642     function.
 643
 644     It returns the tuple (stream, definitive_file_name).
 645     """
 646     if filename == '-':
 647         if sys.platform == 'win32':
 648             import msvcrt
 649
 650             # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
 651             with contextlib.suppress(io.UnsupportedOperation):
 652                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 653         return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 654
 655     for attempt in range(2):
 656         try:
 657             try:
 658                 if sys.platform == 'win32':
 659                     # FIXME: An exclusive lock also locks the file from being read.
 660                     # Since windows locks are mandatory, don't lock the file on windows (for now).
 661                     # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
 662                     raise LockingUnsupportedError()
 663                 stream = locked_file(filename, open_mode, block=False).__enter__()
 664             except OSError:
 665                 stream = open(filename, open_mode)
 666             return stream, filename
 667         except OSError as err:
 668             if attempt or err.errno in (errno.EACCES,):
 669                 raise
 670             old_filename, filename = filename, sanitize_path(filename)
 671             if old_filename == filename:
 672                 raise
 673
 674
 675 def timeconvert(timestr):
 676     """Convert RFC 2822 defined time string into system timestamp"""
 677     timestamp = None
 678     timetuple = email.utils.parsedate_tz(timestr)
 679     if timetuple is not None:
 680         timestamp = email.utils.mktime_tz(timetuple)
 681     return timestamp
 682
 683
 684 def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
 685     """Sanitizes a string so it could be used as part of a filename.
 686     @param restricted   Use a stricter subset of allowed characters
 687     @param is_id        Whether this is an ID that should be kept unchanged if possible.
 688                         If unset, yt-dlp's new sanitization rules are in effect
 689     """
 690     if s == '':
 691         return ''
 692
 693     def replace_insane(char):
 694         if restricted and char in ACCENT_CHARS:
 695             return ACCENT_CHARS[char]
 696         elif not restricted and char == '\n':
 697             return '\0 '
 698         elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
 699             # Replace with their full-width unicode counterparts
 700             return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
 701         elif char == '?' or ord(char) < 32 or ord(char) == 127:
 702             return ''
 703         elif char == '"':
 704             return '' if restricted else '\''
 705         elif char == ':':
 706             return '\0_\0-' if restricted else '\0 \0-'
 707         elif char in '\\/|*<>':
 708             return '\0_'
 709         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
 710             return '\0_'
 711         return char
 712
 713     # Replace look-alike Unicode glyphs
 714     if restricted and (is_id is NO_DEFAULT or not is_id):
 715         s = unicodedata.normalize('NFKC', s)
 716     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
 717     result = ''.join(map(replace_insane, s))
 718     if is_id is NO_DEFAULT:
 719         result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
 720         STRIP_RE = r'(?:\0.|[ _-])*'
 721         result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
 722     result = result.replace('\0', '') or '_'
 723
 724     if not is_id:
 725         while '__' in result:
 726             result = result.replace('__', '_')
 727         result = result.strip('_')
 728         # Common case of "Foreign band name - English song title"
 729         if restricted and result.startswith('-_'):
 730             result = result[2:]
 731         if result.startswith('-'):
 732             result = '_' + result[len('-'):]
 733         result = result.lstrip('.')
 734         if not result:
 735             result = '_'
 736     return result
 737
 738
 739 def sanitize_path(s, force=False):
 740     """Sanitizes and normalizes path on Windows"""
 741     if sys.platform == 'win32':
 742         force = False
 743         drive_or_unc, _ = os.path.splitdrive(s)
 744     elif force:
 745         drive_or_unc = ''
 746     else:
 747         return s
 748
 749     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 750     if drive_or_unc:
 751         norm_path.pop(0)
 752     sanitized_path = [
 753         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 754         for path_part in norm_path]
 755     if drive_or_unc:
 756         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 757     elif force and s and s[0] == os.path.sep:
 758         sanitized_path.insert(0, os.path.sep)
 759     return os.path.join(*sanitized_path)
 760
 761
 762 def sanitize_url(url, *, scheme='http'):
 763     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 764     # the number of unwanted failures due to missing protocol
 765     if url is None:
 766         return
 767     elif url.startswith('//'):
 768         return f'{scheme}:{url}'
 769     # Fix some common typos seen so far
 770     COMMON_TYPOS = (
 771         # https://github.com/ytdl-org/youtube-dl/issues/15649
 772         (r'^httpss://', r'https://'),
 773         # https://bx1.be/lives/direct-tv/
 774         (r'^rmtp([es]?)://', r'rtmp\1://'),
 775     )
 776     for mistake, fixup in COMMON_TYPOS:
 777         if re.match(mistake, url):
 778             return re.sub(mistake, fixup, url)
 779     return url
 780
 781
 782 def extract_basic_auth(url):
 783     parts = urllib.parse.urlsplit(url)
 784     if parts.username is None:
 785         return url, None
 786     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 787         parts.hostname if parts.port is None
 788         else '%s:%d' % (parts.hostname, parts.port))))
 789     auth_payload = base64.b64encode(
 790         ('%s:%s' % (parts.username, parts.password or '')).encode())
 791     return url, f'Basic {auth_payload.decode()}'
 792
 793
 794 def sanitized_Request(url, *args, **kwargs):
 795     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 796     if auth_header is not None:
 797         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 798         headers['Authorization'] = auth_header
 799     return urllib.request.Request(url, *args, **kwargs)
 800
 801
 802 def expand_path(s):
 803     """Expand shell variables and ~"""
 804     return os.path.expandvars(compat_expanduser(s))
 805
 806
 807 def orderedSet(iterable, *, lazy=False):
 808     """Remove all duplicates from the input iterable"""
 809     def _iter():
 810         seen = []  # Do not use set since the items can be unhashable
 811         for x in iterable:
 812             if x not in seen:
 813                 seen.append(x)
 814                 yield x
 815
 816     return _iter() if lazy else list(_iter())
 817
 818
 819 def _htmlentity_transform(entity_with_semicolon):
 820     """Transforms an HTML entity to a character."""
 821     entity = entity_with_semicolon[:-1]
 822
 823     # Known non-numeric HTML entity
 824     if entity in html.entities.name2codepoint:
 825         return chr(html.entities.name2codepoint[entity])
 826
 827     # TODO: HTML5 allows entities without a semicolon.
 828     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 829     if entity_with_semicolon in html.entities.html5:
 830         return html.entities.html5[entity_with_semicolon]
 831
 832     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 833     if mobj is not None:
 834         numstr = mobj.group(1)
 835         if numstr.startswith('x'):
 836             base = 16
 837             numstr = '0%s' % numstr
 838         else:
 839             base = 10
 840         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 841         with contextlib.suppress(ValueError):
 842             return chr(int(numstr, base))
 843
 844     # Unknown entity in name, return its literal representation
 845     return '&%s;' % entity
 846
 847
 848 def unescapeHTML(s):
 849     if s is None:
 850         return None
 851     assert isinstance(s, str)
 852
 853     return re.sub(
 854         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 855
 856
 857 def escapeHTML(text):
 858     return (
 859         text
 860         .replace('&', '&amp;')
 861         .replace('<', '&lt;')
 862         .replace('>', '&gt;')
 863         .replace('"', '&quot;')
 864         .replace("'", '&#39;')
 865     )
 866
 867
 868 class netrc_from_content(netrc.netrc):
 869     def __init__(self, content):
 870         self.hosts, self.macros = {}, {}
 871         with io.StringIO(content) as stream:
 872             self._parse('-', stream, False)
 873
 874
 875 class Popen(subprocess.Popen):
 876     if sys.platform == 'win32':
 877         _startupinfo = subprocess.STARTUPINFO()
 878         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 879     else:
 880         _startupinfo = None
 881
 882     @staticmethod
 883     def _fix_pyinstaller_ld_path(env):
 884         """Restore LD_LIBRARY_PATH when using PyInstaller
 885             Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
 886                  https://github.com/yt-dlp/yt-dlp/issues/4573
 887         """
 888         if not hasattr(sys, '_MEIPASS'):
 889             return
 890
 891         def _fix(key):
 892             orig = env.get(f'{key}_ORIG')
 893             if orig is None:
 894                 env.pop(key, None)
 895             else:
 896                 env[key] = orig
 897
 898         _fix('LD_LIBRARY_PATH')  # Linux
 899         _fix('DYLD_LIBRARY_PATH')  # macOS
 900
 901     def __init__(self, *args, env=None, text=False, **kwargs):
 902         if env is None:
 903             env = os.environ.copy()
 904         self._fix_pyinstaller_ld_path(env)
 905
 906         self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
 907         if text is True:
 908             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 909             kwargs.setdefault('encoding', 'utf-8')
 910             kwargs.setdefault('errors', 'replace')
 911         super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
 912
 913     def communicate_or_kill(self, *args, **kwargs):
 914         try:
 915             return self.communicate(*args, **kwargs)
 916         except BaseException:  # Including KeyboardInterrupt
 917             self.kill(timeout=None)
 918             raise
 919
 920     def kill(self, *, timeout=0):
 921         super().kill()
 922         if timeout != 0:
 923             self.wait(timeout=timeout)
 924
 925     @classmethod
 926     def run(cls, *args, timeout=None, **kwargs):
 927         with cls(*args, **kwargs) as proc:
 928             default = '' if proc.__text_mode else b''
 929             stdout, stderr = proc.communicate_or_kill(timeout=timeout)
 930             return stdout or default, stderr or default, proc.returncode
 931
 932
 933 def encodeArgument(s):
 934     # Legacy code that uses byte strings
 935     # Uncomment the following line after fixing all post processors
 936     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 937     return s if isinstance(s, str) else s.decode('ascii')
 938
 939
 940 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 941
 942
 943 def timetuple_from_msec(msec):
 944     secs, msec = divmod(msec, 1000)
 945     mins, secs = divmod(secs, 60)
 946     hrs, mins = divmod(mins, 60)
 947     return _timetuple(hrs, mins, secs, msec)
 948
 949
 950 def formatSeconds(secs, delim=':', msec=False):
 951     time = timetuple_from_msec(secs * 1000)
 952     if time.hours:
 953         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 954     elif time.minutes:
 955         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 956     else:
 957         ret = '%d' % time.seconds
 958     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 959
 960
 961 def _ssl_load_windows_store_certs(ssl_context, storename):
 962     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 963     try:
 964         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 965                  if encoding == 'x509_asn' and (
 966                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 967     except PermissionError:
 968         return
 969     for cert in certs:
 970         with contextlib.suppress(ssl.SSLError):
 971             ssl_context.load_verify_locations(cadata=cert)
 972
 973
 974 def make_HTTPS_handler(params, **kwargs):
 975     opts_check_certificate = not params.get('nocheckcertificate')
 976     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 977     context.check_hostname = opts_check_certificate
 978     if params.get('legacyserverconnect'):
 979         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
 980         # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
 981         context.set_ciphers('DEFAULT')
 982     elif (
 983         sys.version_info < (3, 10)
 984         and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
 985         and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
 986     ):
 987         # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
 988         # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
 989         # in some situations [2][3].
 990         # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
 991         # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
 992         # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
 993         # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
 994         # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
 995         # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
 996         # 4. https://peps.python.org/pep-0644/
 997         # 5. https://peps.python.org/pep-0644/#libressl-support
 998         # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
 999         context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
1000         context.minimum_version = ssl.TLSVersion.TLSv1_2
1001
1002     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1003     if opts_check_certificate:
1004         if certifi and 'no-certifi' not in params.get('compat_opts', []):
1005             context.load_verify_locations(cafile=certifi.where())
1006         else:
1007             try:
1008                 context.load_default_certs()
1009                 # Work around the issue in load_default_certs when there are bad certificates. See:
1010                 # https://github.com/yt-dlp/yt-dlp/issues/1060,
1011                 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1012             except ssl.SSLError:
1013                 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1014                 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1015                     for storename in ('CA', 'ROOT'):
1016                         _ssl_load_windows_store_certs(context, storename)
1017                 context.set_default_verify_paths()
1018
1019     client_certfile = params.get('client_certificate')
1020     if client_certfile:
1021         try:
1022             context.load_cert_chain(
1023                 client_certfile, keyfile=params.get('client_certificate_key'),
1024                 password=params.get('client_certificate_password'))
1025         except ssl.SSLError:
1026             raise YoutubeDLError('Unable to load client certificate')
1027
1028     # Some servers may reject requests if ALPN extension is not sent. See:
1029     # https://github.com/python/cpython/issues/85140
1030     # https://github.com/yt-dlp/yt-dlp/issues/3878
1031     with contextlib.suppress(NotImplementedError):
1032         context.set_alpn_protocols(['http/1.1'])
1033
1034     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1035
1036
1037 def bug_reports_message(before=';'):
1038     from ..update import REPOSITORY
1039
1040     msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
1041            'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')
1042
1043     before = before.rstrip()
1044     if not before or before.endswith(('.', '!', '?')):
1045         msg = msg[0].title() + msg[1:]
1046
1047     return (before + ' ' if before else '') + msg
1048
1049
1050 class YoutubeDLError(Exception):
1051     """Base exception for YoutubeDL errors."""
1052     msg = None
1053
1054     def __init__(self, msg=None):
1055         if msg is not None:
1056             self.msg = msg
1057         elif self.msg is None:
1058             self.msg = type(self).__name__
1059         super().__init__(self.msg)
1060
1061
1062 network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
1063 if hasattr(ssl, 'CertificateError'):
1064     network_exceptions.append(ssl.CertificateError)
1065 network_exceptions = tuple(network_exceptions)
1066
1067
1068 class ExtractorError(YoutubeDLError):
1069     """Error during info extraction."""
1070
1071     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
1072         """ tb, if given, is the original traceback (so that it can be printed out).
1073         If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
1074         """
1075         if sys.exc_info()[0] in network_exceptions:
1076             expected = True
1077
1078         self.orig_msg = str(msg)
1079         self.traceback = tb
1080         self.expected = expected
1081         self.cause = cause
1082         self.video_id = video_id
1083         self.ie = ie
1084         self.exc_info = sys.exc_info()  # preserve original exception
1085         if isinstance(self.exc_info[1], ExtractorError):
1086             self.exc_info = self.exc_info[1].exc_info
1087         super().__init__(self.__msg)
1088
1089     @property
1090     def __msg(self):
1091         return ''.join((
1092             format_field(self.ie, None, '[%s] '),
1093             format_field(self.video_id, None, '%s: '),
1094             self.orig_msg,
1095             format_field(self.cause, None, ' (caused by %r)'),
1096             '' if self.expected else bug_reports_message()))
1097
1098     def format_traceback(self):
1099         return join_nonempty(
1100             self.traceback and ''.join(traceback.format_tb(self.traceback)),
1101             self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
1102             delim='\n') or None
1103
1104     def __setattr__(self, name, value):
1105         super().__setattr__(name, value)
1106         if getattr(self, 'msg', None) and name not in ('msg', 'args'):
1107             self.msg = self.__msg or type(self).__name__
1108             self.args = (self.msg, )  # Cannot be property
1109
1110
1111 class UnsupportedError(ExtractorError):
1112     def __init__(self, url):
1113         super().__init__(
1114             'Unsupported URL: %s' % url, expected=True)
1115         self.url = url
1116
1117
1118 class RegexNotFoundError(ExtractorError):
1119     """Error when a regex didn't match"""
1120     pass
1121
1122
1123 class GeoRestrictedError(ExtractorError):
1124     """Geographic restriction Error exception.
1125
1126     This exception may be thrown when a video is not available from your
1127     geographic location due to geographic restrictions imposed by a website.
1128     """
1129
1130     def __init__(self, msg, countries=None, **kwargs):
1131         kwargs['expected'] = True
1132         super().__init__(msg, **kwargs)
1133         self.countries = countries
1134
1135
1136 class UserNotLive(ExtractorError):
1137     """Error when a channel/user is not live"""
1138
1139     def __init__(self, msg=None, **kwargs):
1140         kwargs['expected'] = True
1141         super().__init__(msg or 'The channel is not currently live', **kwargs)
1142
1143
1144 class DownloadError(YoutubeDLError):
1145     """Download Error exception.
1146
1147     This exception may be thrown by FileDownloader objects if they are not
1148     configured to continue on errors. They will contain the appropriate
1149     error message.
1150     """
1151
1152     def __init__(self, msg, exc_info=None):
1153         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1154         super().__init__(msg)
1155         self.exc_info = exc_info
1156
1157
1158 class EntryNotInPlaylist(YoutubeDLError):
1159     """Entry not in playlist exception.
1160
1161     This exception will be thrown by YoutubeDL when a requested entry
1162     is not found in the playlist info_dict
1163     """
1164     msg = 'Entry not found in info'
1165
1166
1167 class SameFileError(YoutubeDLError):
1168     """Same File exception.
1169
1170     This exception will be thrown by FileDownloader objects if they detect
1171     multiple files would have to be downloaded to the same file on disk.
1172     """
1173     msg = 'Fixed output name but more than one file to download'
1174
1175     def __init__(self, filename=None):
1176         if filename is not None:
1177             self.msg += f': {filename}'
1178         super().__init__(self.msg)
1179
1180
1181 class PostProcessingError(YoutubeDLError):
1182     """Post Processing exception.
1183
1184     This exception may be raised by PostProcessor's .run() method to
1185     indicate an error in the postprocessing task.
1186     """
1187
1188
1189 class DownloadCancelled(YoutubeDLError):
1190     """ Exception raised when the download queue should be interrupted """
1191     msg = 'The download was cancelled'
1192
1193
1194 class ExistingVideoReached(DownloadCancelled):
1195     """ --break-on-existing triggered """
1196     msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1197
1198
1199 class RejectedVideoReached(DownloadCancelled):
1200     """ --break-match-filter triggered """
1201     msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1202
1203
1204 class MaxDownloadsReached(DownloadCancelled):
1205     """ --max-downloads limit has been reached. """
1206     msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1207
1208
1209 class ReExtractInfo(YoutubeDLError):
1210     """ Video info needs to be re-extracted. """
1211
1212     def __init__(self, msg, expected=False):
1213         super().__init__(msg)
1214         self.expected = expected
1215
1216
1217 class ThrottledDownload(ReExtractInfo):
1218     """ Download speed below --throttled-rate. """
1219     msg = 'The download speed is below throttle limit'
1220
1221     def __init__(self):
1222         super().__init__(self.msg, expected=False)
1223
1224
1225 class UnavailableVideoError(YoutubeDLError):
1226     """Unavailable Format exception.
1227
1228     This exception will be thrown when a video is requested
1229     in a format that is not available for that video.
1230     """
1231     msg = 'Unable to download video'
1232
1233     def __init__(self, err=None):
1234         if err is not None:
1235             self.msg += f': {err}'
1236         super().__init__(self.msg)
1237
1238
1239 class ContentTooShortError(YoutubeDLError):
1240     """Content Too Short exception.
1241
1242     This exception may be raised by FileDownloader objects when a file they
1243     download is too small for what the server announced first, indicating
1244     the connection was probably interrupted.
1245     """
1246
1247     def __init__(self, downloaded, expected):
1248         super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1249         # Both in bytes
1250         self.downloaded = downloaded
1251         self.expected = expected
1252
1253
1254 class XAttrMetadataError(YoutubeDLError):
1255     def __init__(self, code=None, msg='Unknown error'):
1256         super().__init__(msg)
1257         self.code = code
1258         self.msg = msg
1259
1260         # Parsing code and msg
1261         if (self.code in (errno.ENOSPC, errno.EDQUOT)
1262                 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1263             self.reason = 'NO_SPACE'
1264         elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1265             self.reason = 'VALUE_TOO_LONG'
1266         else:
1267             self.reason = 'NOT_SUPPORTED'
1268
1269
1270 class XAttrUnavailableError(YoutubeDLError):
1271     pass
1272
1273
1274 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1275     hc = http_class(*args, **kwargs)
1276     source_address = ydl_handler._params.get('source_address')
1277
1278     if source_address is not None:
1279         # This is to workaround _create_connection() from socket where it will try all
1280         # address data from getaddrinfo() including IPv6. This filters the result from
1281         # getaddrinfo() based on the source_address value.
1282         # This is based on the cpython socket.create_connection() function.
1283         # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1284         def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1285             host, port = address
1286             err = None
1287             addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1288             af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1289             ip_addrs = [addr for addr in addrs if addr[0] == af]
1290             if addrs and not ip_addrs:
1291                 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1292                 raise OSError(
1293                     "No remote IP%s addresses available for connect, can't use '%s' as source address"
1294                     % (ip_version, source_address[0]))
1295             for res in ip_addrs:
1296                 af, socktype, proto, canonname, sa = res
1297                 sock = None
1298                 try:
1299                     sock = socket.socket(af, socktype, proto)
1300                     if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1301                         sock.settimeout(timeout)
1302                     sock.bind(source_address)
1303                     sock.connect(sa)
1304                     err = None  # Explicitly break reference cycle
1305                     return sock
1306                 except OSError as _:
1307                     err = _
1308                     if sock is not None:
1309                         sock.close()
1310             if err is not None:
1311                 raise err
1312             else:
1313                 raise OSError('getaddrinfo returns an empty list')
1314         if hasattr(hc, '_create_connection'):
1315             hc._create_connection = _create_connection
1316         hc.source_address = (source_address, 0)
1317
1318     return hc
1319
1320
1321 class YoutubeDLHandler(urllib.request.HTTPHandler):
1322     """Handler for HTTP requests and responses.
1323
1324     This class, when installed with an OpenerDirector, automatically adds
1325     the standard headers to every HTTP request and handles gzipped, deflated and
1326     brotli responses from web servers.
1327
1328     Part of this code was copied from:
1329
1330     http://techknack.net/python-urllib2-handlers/
1331
1332     Andrew Rowls, the author of that code, agreed to release it to the
1333     public domain.
1334     """
1335
1336     def __init__(self, params, *args, **kwargs):
1337         urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
1338         self._params = params
1339
1340     def http_open(self, req):
1341         conn_class = http.client.HTTPConnection
1342
1343         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1344         if socks_proxy:
1345             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1346             del req.headers['Ytdl-socks-proxy']
1347
1348         return self.do_open(functools.partial(
1349             _create_http_connection, self, conn_class, False),
1350             req)
1351
1352     @staticmethod
1353     def deflate(data):
1354         if not data:
1355             return data
1356         try:
1357             return zlib.decompress(data, -zlib.MAX_WBITS)
1358         except zlib.error:
1359             return zlib.decompress(data)
1360
1361     @staticmethod
1362     def brotli(data):
1363         if not data:
1364             return data
1365         return brotli.decompress(data)
1366
1367     @staticmethod
1368     def gz(data):
1369         gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
1370         try:
1371             return gz.read()
1372         except OSError as original_oserror:
1373             # There may be junk add the end of the file
1374             # See http://stackoverflow.com/q/4928560/35070 for details
1375             for i in range(1, 1024):
1376                 try:
1377                     gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
1378                     return gz.read()
1379                 except OSError:
1380                     continue
1381             else:
1382                 raise original_oserror
1383
1384     def http_request(self, req):
1385         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1386         # always respected by websites, some tend to give out URLs with non percent-encoded
1387         # non-ASCII characters (see telemb.py, ard.py [#3412])
1388         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1389         # To work around aforementioned issue we will replace request's original URL with
1390         # percent-encoded one
1391         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1392         # the code of this workaround has been moved here from YoutubeDL.urlopen()
1393         url = req.get_full_url()
1394         url_escaped = escape_url(url)
1395
1396         # Substitute URL if any change after escaping
1397         if url != url_escaped:
1398             req = update_Request(req, url=url_escaped)
1399
1400         for h, v in self._params.get('http_headers', std_headers).items():
1401             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1402             # The dict keys are capitalized because of this bug by urllib
1403             if h.capitalize() not in req.headers:
1404                 req.add_header(h, v)
1405
1406         if 'Youtubedl-no-compression' in req.headers:  # deprecated
1407             req.headers.pop('Youtubedl-no-compression', None)
1408             req.add_header('Accept-encoding', 'identity')
1409
1410         if 'Accept-encoding' not in req.headers:
1411             req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1412
1413         return super().do_request_(req)
1414
1415     def http_response(self, req, resp):
1416         old_resp = resp
1417
1418         # Content-Encoding header lists the encodings in order that they were applied [1].
1419         # To decompress, we simply do the reverse.
1420         # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
1421         decoded_response = None
1422         for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
1423             if encoding == 'gzip':
1424                 decoded_response = self.gz(decoded_response or resp.read())
1425             elif encoding == 'deflate':
1426                 decoded_response = self.deflate(decoded_response or resp.read())
1427             elif encoding == 'br' and brotli:
1428                 decoded_response = self.brotli(decoded_response or resp.read())
1429
1430         if decoded_response is not None:
1431             resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
1432             resp.msg = old_resp.msg
1433         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1434         # https://github.com/ytdl-org/youtube-dl/issues/6457).
1435         if 300 <= resp.code < 400:
1436             location = resp.headers.get('Location')
1437             if location:
1438                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1439                 location = location.encode('iso-8859-1').decode()
1440                 location_escaped = escape_url(location)
1441                 if location != location_escaped:
1442                     del resp.headers['Location']
1443                     resp.headers['Location'] = location_escaped
1444         return resp
1445
1446     https_request = http_request
1447     https_response = http_response
1448
1449
1450 def make_socks_conn_class(base_class, socks_proxy):
1451     assert issubclass(base_class, (
1452         http.client.HTTPConnection, http.client.HTTPSConnection))
1453
1454     url_components = urllib.parse.urlparse(socks_proxy)
1455     if url_components.scheme.lower() == 'socks5':
1456         socks_type = ProxyType.SOCKS5
1457     elif url_components.scheme.lower() in ('socks', 'socks4'):
1458         socks_type = ProxyType.SOCKS4
1459     elif url_components.scheme.lower() == 'socks4a':
1460         socks_type = ProxyType.SOCKS4A
1461
1462     def unquote_if_non_empty(s):
1463         if not s:
1464             return s
1465         return urllib.parse.unquote_plus(s)
1466
1467     proxy_args = (
1468         socks_type,
1469         url_components.hostname, url_components.port or 1080,
1470         True,  # Remote DNS
1471         unquote_if_non_empty(url_components.username),
1472         unquote_if_non_empty(url_components.password),
1473     )
1474
1475     class SocksConnection(base_class):
1476         def connect(self):
1477             self.sock = sockssocket()
1478             self.sock.setproxy(*proxy_args)
1479             if isinstance(self.timeout, (int, float)):
1480                 self.sock.settimeout(self.timeout)
1481             self.sock.connect((self.host, self.port))
1482
1483             if isinstance(self, http.client.HTTPSConnection):
1484                 if hasattr(self, '_context'):  # Python > 2.6
1485                     self.sock = self._context.wrap_socket(
1486                         self.sock, server_hostname=self.host)
1487                 else:
1488                     self.sock = ssl.wrap_socket(self.sock)
1489
1490     return SocksConnection
1491
1492
1493 class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
1494     def __init__(self, params, https_conn_class=None, *args, **kwargs):
1495         urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1496         self._https_conn_class = https_conn_class or http.client.HTTPSConnection
1497         self._params = params
1498
1499     def https_open(self, req):
1500         kwargs = {}
1501         conn_class = self._https_conn_class
1502
1503         if hasattr(self, '_context'):  # python > 2.6
1504             kwargs['context'] = self._context
1505         if hasattr(self, '_check_hostname'):  # python 3.x
1506             kwargs['check_hostname'] = self._check_hostname
1507
1508         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1509         if socks_proxy:
1510             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1511             del req.headers['Ytdl-socks-proxy']
1512
1513         try:
1514             return self.do_open(
1515                 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1516         except urllib.error.URLError as e:
1517             if (isinstance(e.reason, ssl.SSLError)
1518                     and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1519                 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1520             raise
1521
1522
1523 def is_path_like(f):
1524     return isinstance(f, (str, bytes, os.PathLike))
1525
1526
1527 class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
1528     def __init__(self, cookiejar=None):
1529         urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
1530
1531     def http_response(self, request, response):
1532         return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
1533
1534     https_request = urllib.request.HTTPCookieProcessor.http_request
1535     https_response = http_response
1536
1537
1538 class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
1539     """YoutubeDL redirect handler
1540
1541     The code is based on HTTPRedirectHandler implementation from CPython [1].
1542
1543     This redirect handler fixes and improves the logic to better align with RFC7261
1544      and what browsers tend to do [2][3]
1545
1546     1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1547     2. https://datatracker.ietf.org/doc/html/rfc7231
1548     3. https://github.com/python/cpython/issues/91306
1549     """
1550
1551     http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
1552
1553     def redirect_request(self, req, fp, code, msg, headers, newurl):
1554         if code not in (301, 302, 303, 307, 308):
1555             raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
1556
1557         new_method = req.get_method()
1558         new_data = req.data
1559         remove_headers = []
1560         # A 303 must either use GET or HEAD for subsequent request
1561         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1562         if code == 303 and req.get_method() != 'HEAD':
1563             new_method = 'GET'
1564         # 301 and 302 redirects are commonly turned into a GET from a POST
1565         # for subsequent requests by browsers, so we'll do the same.
1566         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1567         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1568         elif code in (301, 302) and req.get_method() == 'POST':
1569             new_method = 'GET'
1570
1571         # only remove payload if method changed (e.g. POST to GET)
1572         if new_method != req.get_method():
1573             new_data = None
1574             remove_headers.extend(['Content-Length', 'Content-Type'])
1575
1576         new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}
1577
1578         return urllib.request.Request(
1579             newurl, headers=new_headers, origin_req_host=req.origin_req_host,
1580             unverifiable=True, method=new_method, data=new_data)
1581
1582
1583 def extract_timezone(date_str):
1584     m = re.search(
1585         r'''(?x)
1586             ^.{8,}?                                              # >=8 char non-TZ prefix, if present
1587             (?P<tz>Z|                                            # just the UTC Z, or
1588                 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
1589                    (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1590                    [ ]?                                          # optional space
1591                 (?P<sign>\+|-)                                   # +/-
1592                 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
1593             $)
1594         ''', date_str)
1595     if not m:
1596         m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1597         timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1598         if timezone is not None:
1599             date_str = date_str[:-len(m.group('tz'))]
1600         timezone = datetime.timedelta(hours=timezone or 0)
1601     else:
1602         date_str = date_str[:-len(m.group('tz'))]
1603         if not m.group('sign'):
1604             timezone = datetime.timedelta()
1605         else:
1606             sign = 1 if m.group('sign') == '+' else -1
1607             timezone = datetime.timedelta(
1608                 hours=sign * int(m.group('hours')),
1609                 minutes=sign * int(m.group('minutes')))
1610     return timezone, date_str
1611
1612
1613 def parse_iso8601(date_str, delimiter='T', timezone=None):
1614     """ Return a UNIX timestamp from the given date """
1615
1616     if date_str is None:
1617         return None
1618
1619     date_str = re.sub(r'\.[0-9]+', '', date_str)
1620
1621     if timezone is None:
1622         timezone, date_str = extract_timezone(date_str)
1623
1624     with contextlib.suppress(ValueError):
1625         date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1626         dt = datetime.datetime.strptime(date_str, date_format) - timezone
1627         return calendar.timegm(dt.timetuple())
1628
1629
1630 def date_formats(day_first=True):
1631     return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1632
1633
1634 def unified_strdate(date_str, day_first=True):
1635     """Return a string with the date in the format YYYYMMDD"""
1636
1637     if date_str is None:
1638         return None
1639     upload_date = None
1640     # Replace commas
1641     date_str = date_str.replace(',', ' ')
1642     # Remove AM/PM + timezone
1643     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1644     _, date_str = extract_timezone(date_str)
1645
1646     for expression in date_formats(day_first):
1647         with contextlib.suppress(ValueError):
1648             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1649     if upload_date is None:
1650         timetuple = email.utils.parsedate_tz(date_str)
1651         if timetuple:
1652             with contextlib.suppress(ValueError):
1653                 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1654     if upload_date is not None:
1655         return str(upload_date)
1656
1657
1658 def unified_timestamp(date_str, day_first=True):
1659     if not isinstance(date_str, str):
1660         return None
1661
1662     date_str = re.sub(r'\s+', ' ', re.sub(
1663         r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1664
1665     pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1666     timezone, date_str = extract_timezone(date_str)
1667
1668     # Remove AM/PM + timezone
1669     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1670
1671     # Remove unrecognized timezones from ISO 8601 alike timestamps
1672     m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1673     if m:
1674         date_str = date_str[:-len(m.group('tz'))]
1675
1676     # Python only supports microseconds, so remove nanoseconds
1677     m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1678     if m:
1679         date_str = m.group(1)
1680
1681     for expression in date_formats(day_first):
1682         with contextlib.suppress(ValueError):
1683             dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1684             return calendar.timegm(dt.timetuple())
1685
1686     timetuple = email.utils.parsedate_tz(date_str)
1687     if timetuple:
1688         return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1689
1690
1691 def determine_ext(url, default_ext='unknown_video'):
1692     if url is None or '.' not in url:
1693         return default_ext
1694     guess = url.partition('?')[0].rpartition('.')[2]
1695     if re.match(r'^[A-Za-z0-9]+$', guess):
1696         return guess
1697     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1698     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1699         return guess.rstrip('/')
1700     else:
1701         return default_ext
1702
1703
1704 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1705     return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1706
1707
1708 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1709     R"""
1710     Return a datetime object from a string.
1711     Supported format:
1712         (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1713
1714     @param format       strftime format of DATE
1715     @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
1716                         auto: round to the unit provided in date_str (if applicable).
1717     """
1718     auto_precision = False
1719     if precision == 'auto':
1720         auto_precision = True
1721         precision = 'microsecond'
1722     today = datetime_round(datetime.datetime.utcnow(), precision)
1723     if date_str in ('now', 'today'):
1724         return today
1725     if date_str == 'yesterday':
1726         return today - datetime.timedelta(days=1)
1727     match = re.match(
1728         r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1729         date_str)
1730     if match is not None:
1731         start_time = datetime_from_str(match.group('start'), precision, format)
1732         time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1733         unit = match.group('unit')
1734         if unit == 'month' or unit == 'year':
1735             new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1736             unit = 'day'
1737         else:
1738             if unit == 'week':
1739                 unit = 'day'
1740                 time *= 7
1741             delta = datetime.timedelta(**{unit + 's': time})
1742             new_date = start_time + delta
1743         if auto_precision:
1744             return datetime_round(new_date, unit)
1745         return new_date
1746
1747     return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1748
1749
1750 def date_from_str(date_str, format='%Y%m%d', strict=False):
1751     R"""
1752     Return a date object from a string using datetime_from_str
1753
1754     @param strict  Restrict allowed patterns to "YYYYMMDD" and
1755                    (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1756     """
1757     if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1758         raise ValueError(f'Invalid date format "{date_str}"')
1759     return datetime_from_str(date_str, precision='microsecond', format=format).date()
1760
1761
1762 def datetime_add_months(dt, months):
1763     """Increment/Decrement a datetime object by months."""
1764     month = dt.month + months - 1
1765     year = dt.year + month // 12
1766     month = month % 12 + 1
1767     day = min(dt.day, calendar.monthrange(year, month)[1])
1768     return dt.replace(year, month, day)
1769
1770
1771 def datetime_round(dt, precision='day'):
1772     """
1773     Round a datetime object's time to a specific precision
1774     """
1775     if precision == 'microsecond':
1776         return dt
1777
1778     unit_seconds = {
1779         'day': 86400,
1780         'hour': 3600,
1781         'minute': 60,
1782         'second': 1,
1783     }
1784     roundto = lambda x, n: ((x + n / 2) // n) * n
1785     timestamp = calendar.timegm(dt.timetuple())
1786     return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1787
1788
1789 def hyphenate_date(date_str):
1790     """
1791     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1792     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1793     if match is not None:
1794         return '-'.join(match.groups())
1795     else:
1796         return date_str
1797
1798
1799 class DateRange:
1800     """Represents a time interval between two dates"""
1801
1802     def __init__(self, start=None, end=None):
1803         """start and end must be strings in the format accepted by date"""
1804         if start is not None:
1805             self.start = date_from_str(start, strict=True)
1806         else:
1807             self.start = datetime.datetime.min.date()
1808         if end is not None:
1809             self.end = date_from_str(end, strict=True)
1810         else:
1811             self.end = datetime.datetime.max.date()
1812         if self.start > self.end:
1813             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1814
1815     @classmethod
1816     def day(cls, day):
1817         """Returns a range that only contains the given day"""
1818         return cls(day, day)
1819
1820     def __contains__(self, date):
1821         """Check if the date is in the range"""
1822         if not isinstance(date, datetime.date):
1823             date = date_from_str(date)
1824         return self.start <= date <= self.end
1825
1826     def __repr__(self):
1827         return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
1828
1829     def __eq__(self, other):
1830         return (isinstance(other, DateRange)
1831                 and self.start == other.start and self.end == other.end)
1832
1833
1834 @functools.cache
1835 def system_identifier():
1836     python_implementation = platform.python_implementation()
1837     if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1838         python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1839     libc_ver = []
1840     with contextlib.suppress(OSError):  # We may not have access to the executable
1841         libc_ver = platform.libc_ver()
1842
1843     return 'Python %s (%s %s %s) - %s (%s%s)' % (
1844         platform.python_version(),
1845         python_implementation,
1846         platform.machine(),
1847         platform.architecture()[0],
1848         platform.platform(),
1849         ssl.OPENSSL_VERSION,
1850         format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
1851     )
1852
1853
1854 @functools.cache
1855 def get_windows_version():
1856     ''' Get Windows version. returns () if it's not running on Windows '''
1857     if compat_os_name == 'nt':
1858         return version_tuple(platform.win32_ver()[1])
1859     else:
1860         return ()
1861
1862
1863 def write_string(s, out=None, encoding=None):
1864     assert isinstance(s, str)
1865     out = out or sys.stderr
1866     # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
1867     if not out:
1868         return
1869
1870     if compat_os_name == 'nt' and supports_terminal_sequences(out):
1871         s = re.sub(r'([\r\n]+)', r' \1', s)
1872
1873     enc, buffer = None, out
1874     if 'b' in getattr(out, 'mode', ''):
1875         enc = encoding or preferredencoding()
1876     elif hasattr(out, 'buffer'):
1877         buffer = out.buffer
1878         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1879
1880     buffer.write(s.encode(enc, 'ignore') if enc else s)
1881     out.flush()
1882
1883
1884 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
1885     from .. import _IN_CLI
1886     if _IN_CLI:
1887         if msg in deprecation_warning._cache:
1888             return
1889         deprecation_warning._cache.add(msg)
1890         if printer:
1891             return printer(f'{msg}{bug_reports_message()}', **kwargs)
1892         return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
1893     else:
1894         import warnings
1895         warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
1896
1897
1898 deprecation_warning._cache = set()
1899
1900
1901 def bytes_to_intlist(bs):
1902     if not bs:
1903         return []
1904     if isinstance(bs[0], int):  # Python 3
1905         return list(bs)
1906     else:
1907         return [ord(c) for c in bs]
1908
1909
1910 def intlist_to_bytes(xs):
1911     if not xs:
1912         return b''
1913     return struct.pack('%dB' % len(xs), *xs)
1914
1915
1916 class LockingUnsupportedError(OSError):
1917     msg = 'File locking is not supported'
1918
1919     def __init__(self):
1920         super().__init__(self.msg)
1921
1922
1923 # Cross-platform file locking
1924 if sys.platform == 'win32':
1925     import ctypes
1926     import ctypes.wintypes
1927     import msvcrt
1928
1929     class OVERLAPPED(ctypes.Structure):
1930         _fields_ = [
1931             ('Internal', ctypes.wintypes.LPVOID),
1932             ('InternalHigh', ctypes.wintypes.LPVOID),
1933             ('Offset', ctypes.wintypes.DWORD),
1934             ('OffsetHigh', ctypes.wintypes.DWORD),
1935             ('hEvent', ctypes.wintypes.HANDLE),
1936         ]
1937
1938     kernel32 = ctypes.WinDLL('kernel32')
1939     LockFileEx = kernel32.LockFileEx
1940     LockFileEx.argtypes = [
1941         ctypes.wintypes.HANDLE,     # hFile
1942         ctypes.wintypes.DWORD,      # dwFlags
1943         ctypes.wintypes.DWORD,      # dwReserved
1944         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1945         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1946         ctypes.POINTER(OVERLAPPED)  # Overlapped
1947     ]
1948     LockFileEx.restype = ctypes.wintypes.BOOL
1949     UnlockFileEx = kernel32.UnlockFileEx
1950     UnlockFileEx.argtypes = [
1951         ctypes.wintypes.HANDLE,     # hFile
1952         ctypes.wintypes.DWORD,      # dwReserved
1953         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1954         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1955         ctypes.POINTER(OVERLAPPED)  # Overlapped
1956     ]
1957     UnlockFileEx.restype = ctypes.wintypes.BOOL
1958     whole_low = 0xffffffff
1959     whole_high = 0x7fffffff
1960
1961     def _lock_file(f, exclusive, block):
1962         overlapped = OVERLAPPED()
1963         overlapped.Offset = 0
1964         overlapped.OffsetHigh = 0
1965         overlapped.hEvent = 0
1966         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1967
1968         if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1969                           (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1970                           0, whole_low, whole_high, f._lock_file_overlapped_p):
1971             # NB: No argument form of "ctypes.FormatError" does not work on PyPy
1972             raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
1973
1974     def _unlock_file(f):
1975         assert f._lock_file_overlapped_p
1976         handle = msvcrt.get_osfhandle(f.fileno())
1977         if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
1978             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1979
1980 else:
1981     try:
1982         import fcntl
1983
1984         def _lock_file(f, exclusive, block):
1985             flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1986             if not block:
1987                 flags |= fcntl.LOCK_NB
1988             try:
1989                 fcntl.flock(f, flags)
1990             except BlockingIOError:
1991                 raise
1992             except OSError:  # AOSP does not have flock()
1993                 fcntl.lockf(f, flags)
1994
1995         def _unlock_file(f):
1996             with contextlib.suppress(OSError):
1997                 return fcntl.flock(f, fcntl.LOCK_UN)
1998             with contextlib.suppress(OSError):
1999                 return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
2000             return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking
2001
2002     except ImportError:
2003
2004         def _lock_file(f, exclusive, block):
2005             raise LockingUnsupportedError()
2006
2007         def _unlock_file(f):
2008             raise LockingUnsupportedError()
2009
2010
2011 class locked_file:
2012     locked = False
2013
2014     def __init__(self, filename, mode, block=True, encoding=None):
2015         if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2016             raise NotImplementedError(mode)
2017         self.mode, self.block = mode, block
2018
2019         writable = any(f in mode for f in 'wax+')
2020         readable = any(f in mode for f in 'r+')
2021         flags = functools.reduce(operator.ior, (
2022             getattr(os, 'O_CLOEXEC', 0),  # UNIX only
2023             getattr(os, 'O_BINARY', 0),  # Windows only
2024             getattr(os, 'O_NOINHERIT', 0),  # Windows only
2025             os.O_CREAT if writable else 0,  # O_TRUNC only after locking
2026             os.O_APPEND if 'a' in mode else 0,
2027             os.O_EXCL if 'x' in mode else 0,
2028             os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2029         ))
2030
2031         self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2032
2033     def __enter__(self):
2034         exclusive = 'r' not in self.mode
2035         try:
2036             _lock_file(self.f, exclusive, self.block)
2037             self.locked = True
2038         except OSError:
2039             self.f.close()
2040             raise
2041         if 'w' in self.mode:
2042             try:
2043                 self.f.truncate()
2044             except OSError as e:
2045                 if e.errno not in (
2046                     errno.ESPIPE,  # Illegal seek - expected for FIFO
2047                     errno.EINVAL,  # Invalid argument - expected for /dev/null
2048                 ):
2049                     raise
2050         return self
2051
2052     def unlock(self):
2053         if not self.locked:
2054             return
2055         try:
2056             _unlock_file(self.f)
2057         finally:
2058             self.locked = False
2059
2060     def __exit__(self, *_):
2061         try:
2062             self.unlock()
2063         finally:
2064             self.f.close()
2065
2066     open = __enter__
2067     close = __exit__
2068
2069     def __getattr__(self, attr):
2070         return getattr(self.f, attr)
2071
2072     def __iter__(self):
2073         return iter(self.f)
2074
2075
2076 @functools.cache
2077 def get_filesystem_encoding():
2078     encoding = sys.getfilesystemencoding()
2079     return encoding if encoding is not None else 'utf-8'
2080
2081
2082 def shell_quote(args):
2083     quoted_args = []
2084     encoding = get_filesystem_encoding()
2085     for a in args:
2086         if isinstance(a, bytes):
2087             # We may get a filename encoded with 'encodeFilename'
2088             a = a.decode(encoding)
2089         quoted_args.append(compat_shlex_quote(a))
2090     return ' '.join(quoted_args)
2091
2092
2093 def smuggle_url(url, data):
2094     """ Pass additional data in a URL for internal use. """
2095
2096     url, idata = unsmuggle_url(url, {})
2097     data.update(idata)
2098     sdata = urllib.parse.urlencode(
2099         {'__youtubedl_smuggle': json.dumps(data)})
2100     return url + '#' + sdata
2101
2102
2103 def unsmuggle_url(smug_url, default=None):
2104     if '#__youtubedl_smuggle' not in smug_url:
2105         return smug_url, default
2106     url, _, sdata = smug_url.rpartition('#')
2107     jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2108     data = json.loads(jsond)
2109     return url, data
2110
2111
2112 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2113     """ Formats numbers with decimal sufixes like K, M, etc """
2114     num, factor = float_or_none(num), float(factor)
2115     if num is None or num < 0:
2116         return None
2117     POSSIBLE_SUFFIXES = 'kMGTPEZY'
2118     exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2119     suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2120     if factor == 1024:
2121         suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2122     converted = num / (factor ** exponent)
2123     return fmt % (converted, suffix)
2124
2125
2126 def format_bytes(bytes):
2127     return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2128
2129
2130 def lookup_unit_table(unit_table, s, strict=False):
2131     num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
2132     units_re = '|'.join(re.escape(u) for u in unit_table)
2133     m = (re.fullmatch if strict else re.match)(
2134         rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
2135     if not m:
2136         return None
2137
2138     num = float(m.group('num').replace(',', '.'))
2139     mult = unit_table[m.group('unit')]
2140     return round(num * mult)
2141
2142
2143 def parse_bytes(s):
2144     """Parse a string indicating a byte quantity into an integer"""
2145     return lookup_unit_table(
2146         {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2147         s.upper(), strict=True)
2148
2149
2150 def parse_filesize(s):
2151     if s is None:
2152         return None
2153
2154     # The lower-case forms are of course incorrect and unofficial,
2155     # but we support those too
2156     _UNIT_TABLE = {
2157         'B': 1,
2158         'b': 1,
2159         'bytes': 1,
2160         'KiB': 1024,
2161         'KB': 1000,
2162         'kB': 1024,
2163         'Kb': 1000,
2164         'kb': 1000,
2165         'kilobytes': 1000,
2166         'kibibytes': 1024,
2167         'MiB': 1024 ** 2,
2168         'MB': 1000 ** 2,
2169         'mB': 1024 ** 2,
2170         'Mb': 1000 ** 2,
2171         'mb': 1000 ** 2,
2172         'megabytes': 1000 ** 2,
2173         'mebibytes': 1024 ** 2,
2174         'GiB': 1024 ** 3,
2175         'GB': 1000 ** 3,
2176         'gB': 1024 ** 3,
2177         'Gb': 1000 ** 3,
2178         'gb': 1000 ** 3,
2179         'gigabytes': 1000 ** 3,
2180         'gibibytes': 1024 ** 3,
2181         'TiB': 1024 ** 4,
2182         'TB': 1000 ** 4,
2183         'tB': 1024 ** 4,
2184         'Tb': 1000 ** 4,
2185         'tb': 1000 ** 4,
2186         'terabytes': 1000 ** 4,
2187         'tebibytes': 1024 ** 4,
2188         'PiB': 1024 ** 5,
2189         'PB': 1000 ** 5,
2190         'pB': 1024 ** 5,
2191         'Pb': 1000 ** 5,
2192         'pb': 1000 ** 5,
2193         'petabytes': 1000 ** 5,
2194         'pebibytes': 1024 ** 5,
2195         'EiB': 1024 ** 6,
2196         'EB': 1000 ** 6,
2197         'eB': 1024 ** 6,
2198         'Eb': 1000 ** 6,
2199         'eb': 1000 ** 6,
2200         'exabytes': 1000 ** 6,
2201         'exbibytes': 1024 ** 6,
2202         'ZiB': 1024 ** 7,
2203         'ZB': 1000 ** 7,
2204         'zB': 1024 ** 7,
2205         'Zb': 1000 ** 7,
2206         'zb': 1000 ** 7,
2207         'zettabytes': 1000 ** 7,
2208         'zebibytes': 1024 ** 7,
2209         'YiB': 1024 ** 8,
2210         'YB': 1000 ** 8,
2211         'yB': 1024 ** 8,
2212         'Yb': 1000 ** 8,
2213         'yb': 1000 ** 8,
2214         'yottabytes': 1000 ** 8,
2215         'yobibytes': 1024 ** 8,
2216     }
2217
2218     return lookup_unit_table(_UNIT_TABLE, s)
2219
2220
2221 def parse_count(s):
2222     if s is None:
2223         return None
2224
2225     s = re.sub(r'^[^\d]+\s', '', s).strip()
2226
2227     if re.match(r'^[\d,.]+$', s):
2228         return str_to_int(s)
2229
2230     _UNIT_TABLE = {
2231         'k': 1000,
2232         'K': 1000,
2233         'm': 1000 ** 2,
2234         'M': 1000 ** 2,
2235         'kk': 1000 ** 2,
2236         'KK': 1000 ** 2,
2237         'b': 1000 ** 3,
2238         'B': 1000 ** 3,
2239     }
2240
2241     ret = lookup_unit_table(_UNIT_TABLE, s)
2242     if ret is not None:
2243         return ret
2244
2245     mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2246     if mobj:
2247         return str_to_int(mobj.group(1))
2248
2249
2250 def parse_resolution(s, *, lenient=False):
2251     if s is None:
2252         return {}
2253
2254     if lenient:
2255         mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2256     else:
2257         mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2258     if mobj:
2259         return {
2260             'width': int(mobj.group('w')),
2261             'height': int(mobj.group('h')),
2262         }
2263
2264     mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2265     if mobj:
2266         return {'height': int(mobj.group(1))}
2267
2268     mobj = re.search(r'\b([48])[kK]\b', s)
2269     if mobj:
2270         return {'height': int(mobj.group(1)) * 540}
2271
2272     return {}
2273
2274
2275 def parse_bitrate(s):
2276     if not isinstance(s, str):
2277         return
2278     mobj = re.search(r'\b(\d+)\s*kbps', s)
2279     if mobj:
2280         return int(mobj.group(1))
2281
2282
2283 def month_by_name(name, lang='en'):
2284     """ Return the number of a month by (locale-independently) English name """
2285
2286     month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2287
2288     try:
2289         return month_names.index(name) + 1
2290     except ValueError:
2291         return None
2292
2293
2294 def month_by_abbreviation(abbrev):
2295     """ Return the number of a month by (locale-independently) English
2296         abbreviations """
2297
2298     try:
2299         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2300     except ValueError:
2301         return None
2302
2303
2304 def fix_xml_ampersands(xml_str):
2305     """Replace all the '&' by '&amp;' in XML"""
2306     return re.sub(
2307         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2308         '&amp;',
2309         xml_str)
2310
2311
2312 def setproctitle(title):
2313     assert isinstance(title, str)
2314
2315     # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2316     try:
2317         import ctypes
2318     except ImportError:
2319         return
2320
2321     try:
2322         libc = ctypes.cdll.LoadLibrary('libc.so.6')
2323     except OSError:
2324         return
2325     except TypeError:
2326         # LoadLibrary in Windows Python 2.7.13 only expects
2327         # a bytestring, but since unicode_literals turns
2328         # every string into a unicode string, it fails.
2329         return
2330     title_bytes = title.encode()
2331     buf = ctypes.create_string_buffer(len(title_bytes))
2332     buf.value = title_bytes
2333     try:
2334         libc.prctl(15, buf, 0, 0, 0)
2335     except AttributeError:
2336         return  # Strange libc, just skip this
2337
2338
2339 def remove_start(s, start):
2340     return s[len(start):] if s is not None and s.startswith(start) else s
2341
2342
2343 def remove_end(s, end):
2344     return s[:-len(end)] if s is not None and s.endswith(end) else s
2345
2346
2347 def remove_quotes(s):
2348     if s is None or len(s) < 2:
2349         return s
2350     for quote in ('"', "'", ):
2351         if s[0] == quote and s[-1] == quote:
2352             return s[1:-1]
2353     return s
2354
2355
2356 def get_domain(url):
2357     """
2358     This implementation is inconsistent, but is kept for compatibility.
2359     Use this only for "webpage_url_domain"
2360     """
2361     return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2362
2363
2364 def url_basename(url):
2365     path = urllib.parse.urlparse(url).path
2366     return path.strip('/').split('/')[-1]
2367
2368
2369 def base_url(url):
2370     return re.match(r'https?://[^?#]+/', url).group()
2371
2372
2373 def urljoin(base, path):
2374     if isinstance(path, bytes):
2375         path = path.decode()
2376     if not isinstance(path, str) or not path:
2377         return None
2378     if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2379         return path
2380     if isinstance(base, bytes):
2381         base = base.decode()
2382     if not isinstance(base, str) or not re.match(
2383             r'^(?:https?:)?//', base):
2384         return None
2385     return urllib.parse.urljoin(base, path)
2386
2387
2388 class HEADRequest(urllib.request.Request):
2389     def get_method(self):
2390         return 'HEAD'
2391
2392
2393 class PUTRequest(urllib.request.Request):
2394     def get_method(self):
2395         return 'PUT'
2396
2397
2398 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2399     if get_attr and v is not None:
2400         v = getattr(v, get_attr, None)
2401     try:
2402         return int(v) * invscale // scale
2403     except (ValueError, TypeError, OverflowError):
2404         return default
2405
2406
2407 def str_or_none(v, default=None):
2408     return default if v is None else str(v)
2409
2410
2411 def str_to_int(int_str):
2412     """ A more relaxed version of int_or_none """
2413     if isinstance(int_str, int):
2414         return int_str
2415     elif isinstance(int_str, str):
2416         int_str = re.sub(r'[,\.\+]', '', int_str)
2417         return int_or_none(int_str)
2418
2419
2420 def float_or_none(v, scale=1, invscale=1, default=None):
2421     if v is None:
2422         return default
2423     try:
2424         return float(v) * invscale / scale
2425     except (ValueError, TypeError):
2426         return default
2427
2428
2429 def bool_or_none(v, default=None):
2430     return v if isinstance(v, bool) else default
2431
2432
2433 def strip_or_none(v, default=None):
2434     return v.strip() if isinstance(v, str) else default
2435
2436
2437 def url_or_none(url):
2438     if not url or not isinstance(url, str):
2439         return None
2440     url = url.strip()
2441     return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2442
2443
2444 def request_to_url(req):
2445     if isinstance(req, urllib.request.Request):
2446         return req.get_full_url()
2447     else:
2448         return req
2449
2450
2451 def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
2452     datetime_object = None
2453     try:
2454         if isinstance(timestamp, (int, float)):  # unix timestamp
2455             # Using naive datetime here can break timestamp() in Windows
2456             # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2457             # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
2458             # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
2459             datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
2460                                + datetime.timedelta(seconds=timestamp))
2461         elif isinstance(timestamp, str):  # assume YYYYMMDD
2462             datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2463         date_format = re.sub(  # Support %s on windows
2464             r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
2465         return datetime_object.strftime(date_format)
2466     except (ValueError, TypeError, AttributeError):
2467         return default
2468
2469
2470 def parse_duration(s):
2471     if not isinstance(s, str):
2472         return None
2473     s = s.strip()
2474     if not s:
2475         return None
2476
2477     days, hours, mins, secs, ms = [None] * 5
2478     m = re.match(r'''(?x)
2479             (?P<before_secs>
2480                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2481             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2482             (?P<ms>[.:][0-9]+)?Z?$
2483         ''', s)
2484     if m:
2485         days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2486     else:
2487         m = re.match(
2488             r'''(?ix)(?:P?
2489                 (?:
2490                     [0-9]+\s*y(?:ears?)?,?\s*
2491                 )?
2492                 (?:
2493                     [0-9]+\s*m(?:onths?)?,?\s*
2494                 )?
2495                 (?:
2496                     [0-9]+\s*w(?:eeks?)?,?\s*
2497                 )?
2498                 (?:
2499                     (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2500                 )?
2501                 T)?
2502                 (?:
2503                     (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2504                 )?
2505                 (?:
2506                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2507                 )?
2508                 (?:
2509                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2510                 )?Z?$''', s)
2511         if m:
2512             days, hours, mins, secs, ms = m.groups()
2513         else:
2514             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2515             if m:
2516                 hours, mins = m.groups()
2517             else:
2518                 return None
2519
2520     if ms:
2521         ms = ms.replace(':', '.')
2522     return sum(float(part or 0) * mult for part, mult in (
2523         (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2524
2525
2526 def prepend_extension(filename, ext, expected_real_ext=None):
2527     name, real_ext = os.path.splitext(filename)
2528     return (
2529         f'{name}.{ext}{real_ext}'
2530         if not expected_real_ext or real_ext[1:] == expected_real_ext
2531         else f'{filename}.{ext}')
2532
2533
2534 def replace_extension(filename, ext, expected_real_ext=None):
2535     name, real_ext = os.path.splitext(filename)
2536     return '{}.{}'.format(
2537         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2538         ext)
2539
2540
2541 def check_executable(exe, args=[]):
2542     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2543     args can be a list of arguments for a short output (like -version) """
2544     try:
2545         Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2546     except OSError:
2547         return False
2548     return exe
2549
2550
2551 def _get_exe_version_output(exe, args):
2552     try:
2553         # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2554         # SIGTTOU if yt-dlp is run in the background.
2555         # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2556         stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2557                                    stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2558         if ret:
2559             return None
2560     except OSError:
2561         return False
2562     return stdout
2563
2564
2565 def detect_exe_version(output, version_re=None, unrecognized='present'):
2566     assert isinstance(output, str)
2567     if version_re is None:
2568         version_re = r'version\s+([-0-9._a-zA-Z]+)'
2569     m = re.search(version_re, output)
2570     if m:
2571         return m.group(1)
2572     else:
2573         return unrecognized
2574
2575
2576 def get_exe_version(exe, args=['--version'],
2577                     version_re=None, unrecognized=('present', 'broken')):
2578     """ Returns the version of the specified executable,
2579     or False if the executable is not present """
2580     unrecognized = variadic(unrecognized)
2581     assert len(unrecognized) in (1, 2)
2582     out = _get_exe_version_output(exe, args)
2583     if out is None:
2584         return unrecognized[-1]
2585     return out and detect_exe_version(out, version_re, unrecognized[0])
2586
2587
2588 def frange(start=0, stop=None, step=1):
2589     """Float range"""
2590     if stop is None:
2591         start, stop = 0, start
2592     sign = [-1, 1][step > 0] if step else 0
2593     while sign * start < sign * stop:
2594         yield start
2595         start += step
2596
2597
2598 class LazyList(collections.abc.Sequence):
2599     """Lazy immutable list from an iterable
2600     Note that slices of a LazyList are lists and not LazyList"""
2601
2602     class IndexError(IndexError):
2603         pass
2604
2605     def __init__(self, iterable, *, reverse=False, _cache=None):
2606         self._iterable = iter(iterable)
2607         self._cache = [] if _cache is None else _cache
2608         self._reversed = reverse
2609
2610     def __iter__(self):
2611         if self._reversed:
2612             # We need to consume the entire iterable to iterate in reverse
2613             yield from self.exhaust()
2614             return
2615         yield from self._cache
2616         for item in self._iterable:
2617             self._cache.append(item)
2618             yield item
2619
2620     def _exhaust(self):
2621         self._cache.extend(self._iterable)
2622         self._iterable = []  # Discard the emptied iterable to make it pickle-able
2623         return self._cache
2624
2625     def exhaust(self):
2626         """Evaluate the entire iterable"""
2627         return self._exhaust()[::-1 if self._reversed else 1]
2628
2629     @staticmethod
2630     def _reverse_index(x):
2631         return None if x is None else ~x
2632
2633     def __getitem__(self, idx):
2634         if isinstance(idx, slice):
2635             if self._reversed:
2636                 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2637             start, stop, step = idx.start, idx.stop, idx.step or 1
2638         elif isinstance(idx, int):
2639             if self._reversed:
2640                 idx = self._reverse_index(idx)
2641             start, stop, step = idx, idx, 0
2642         else:
2643             raise TypeError('indices must be integers or slices')
2644         if ((start or 0) < 0 or (stop or 0) < 0
2645                 or (start is None and step < 0)
2646                 or (stop is None and step > 0)):
2647             # We need to consume the entire iterable to be able to slice from the end
2648             # Obviously, never use this with infinite iterables
2649             self._exhaust()
2650             try:
2651                 return self._cache[idx]
2652             except IndexError as e:
2653                 raise self.IndexError(e) from e
2654         n = max(start or 0, stop or 0) - len(self._cache) + 1
2655         if n > 0:
2656             self._cache.extend(itertools.islice(self._iterable, n))
2657         try:
2658             return self._cache[idx]
2659         except IndexError as e:
2660             raise self.IndexError(e) from e
2661
2662     def __bool__(self):
2663         try:
2664             self[-1] if self._reversed else self[0]
2665         except self.IndexError:
2666             return False
2667         return True
2668
2669     def __len__(self):
2670         self._exhaust()
2671         return len(self._cache)
2672
2673     def __reversed__(self):
2674         return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2675
2676     def __copy__(self):
2677         return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2678
2679     def __repr__(self):
2680         # repr and str should mimic a list. So we exhaust the iterable
2681         return repr(self.exhaust())
2682
2683     def __str__(self):
2684         return repr(self.exhaust())
2685
2686
2687 class PagedList:
2688
2689     class IndexError(IndexError):
2690         pass
2691
2692     def __len__(self):
2693         # This is only useful for tests
2694         return len(self.getslice())
2695
2696     def __init__(self, pagefunc, pagesize, use_cache=True):
2697         self._pagefunc = pagefunc
2698         self._pagesize = pagesize
2699         self._pagecount = float('inf')
2700         self._use_cache = use_cache
2701         self._cache = {}
2702
2703     def getpage(self, pagenum):
2704         page_results = self._cache.get(pagenum)
2705         if page_results is None:
2706             page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2707         if self._use_cache:
2708             self._cache[pagenum] = page_results
2709         return page_results
2710
2711     def getslice(self, start=0, end=None):
2712         return list(self._getslice(start, end))
2713
2714     def _getslice(self, start, end):
2715         raise NotImplementedError('This method must be implemented by subclasses')
2716
2717     def __getitem__(self, idx):
2718         assert self._use_cache, 'Indexing PagedList requires cache'
2719         if not isinstance(idx, int) or idx < 0:
2720             raise TypeError('indices must be non-negative integers')
2721         entries = self.getslice(idx, idx + 1)
2722         if not entries:
2723             raise self.IndexError()
2724         return entries[0]
2725
2726
2727 class OnDemandPagedList(PagedList):
2728     """Download pages until a page with less than maximum results"""
2729
2730     def _getslice(self, start, end):
2731         for pagenum in itertools.count(start // self._pagesize):
2732             firstid = pagenum * self._pagesize
2733             nextfirstid = pagenum * self._pagesize + self._pagesize
2734             if start >= nextfirstid:
2735                 continue
2736
2737             startv = (
2738                 start % self._pagesize
2739                 if firstid <= start < nextfirstid
2740                 else 0)
2741             endv = (
2742                 ((end - 1) % self._pagesize) + 1
2743                 if (end is not None and firstid <= end <= nextfirstid)
2744                 else None)
2745
2746             try:
2747                 page_results = self.getpage(pagenum)
2748             except Exception:
2749                 self._pagecount = pagenum - 1
2750                 raise
2751             if startv != 0 or endv is not None:
2752                 page_results = page_results[startv:endv]
2753             yield from page_results
2754
2755             # A little optimization - if current page is not "full", ie. does
2756             # not contain page_size videos then we can assume that this page
2757             # is the last one - there are no more ids on further pages -
2758             # i.e. no need to query again.
2759             if len(page_results) + startv < self._pagesize:
2760                 break
2761
2762             # If we got the whole page, but the next page is not interesting,
2763             # break out early as well
2764             if end == nextfirstid:
2765                 break
2766
2767
2768 class InAdvancePagedList(PagedList):
2769     """PagedList with total number of pages known in advance"""
2770
2771     def __init__(self, pagefunc, pagecount, pagesize):
2772         PagedList.__init__(self, pagefunc, pagesize, True)
2773         self._pagecount = pagecount
2774
2775     def _getslice(self, start, end):
2776         start_page = start // self._pagesize
2777         end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2778         skip_elems = start - start_page * self._pagesize
2779         only_more = None if end is None else end - start
2780         for pagenum in range(start_page, end_page):
2781             page_results = self.getpage(pagenum)
2782             if skip_elems:
2783                 page_results = page_results[skip_elems:]
2784                 skip_elems = None
2785             if only_more is not None:
2786                 if len(page_results) < only_more:
2787                     only_more -= len(page_results)
2788                 else:
2789                     yield from page_results[:only_more]
2790                     break
2791             yield from page_results
2792
2793
2794 class PlaylistEntries:
2795     MissingEntry = object()
2796     is_exhausted = False
2797
2798     def __init__(self, ydl, info_dict):
2799         self.ydl = ydl
2800
2801         # _entries must be assigned now since infodict can change during iteration
2802         entries = info_dict.get('entries')
2803         if entries is None:
2804             raise EntryNotInPlaylist('There are no entries')
2805         elif isinstance(entries, list):
2806             self.is_exhausted = True
2807
2808         requested_entries = info_dict.get('requested_entries')
2809         self.is_incomplete = requested_entries is not None
2810         if self.is_incomplete:
2811             assert self.is_exhausted
2812             self._entries = [self.MissingEntry] * max(requested_entries or [0])
2813             for i, entry in zip(requested_entries, entries):
2814                 self._entries[i - 1] = entry
2815         elif isinstance(entries, (list, PagedList, LazyList)):
2816             self._entries = entries
2817         else:
2818             self._entries = LazyList(entries)
2819
2820     PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2821         (?P<start>[+-]?\d+)?
2822         (?P<range>[:-]
2823             (?P<end>[+-]?\d+|inf(?:inite)?)?
2824             (?::(?P<step>[+-]?\d+))?
2825         )?''')
2826
2827     @classmethod
2828     def parse_playlist_items(cls, string):
2829         for segment in string.split(','):
2830             if not segment:
2831                 raise ValueError('There is two or more consecutive commas')
2832             mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2833             if not mobj:
2834                 raise ValueError(f'{segment!r} is not a valid specification')
2835             start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2836             if int_or_none(step) == 0:
2837                 raise ValueError(f'Step in {segment!r} cannot be zero')
2838             yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2839
2840     def get_requested_items(self):
2841         playlist_items = self.ydl.params.get('playlist_items')
2842         playlist_start = self.ydl.params.get('playliststart', 1)
2843         playlist_end = self.ydl.params.get('playlistend')
2844         # For backwards compatibility, interpret -1 as whole list
2845         if playlist_end in (-1, None):
2846             playlist_end = ''
2847         if not playlist_items:
2848             playlist_items = f'{playlist_start}:{playlist_end}'
2849         elif playlist_start != 1 or playlist_end:
2850             self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2851
2852         for index in self.parse_playlist_items(playlist_items):
2853             for i, entry in self[index]:
2854                 yield i, entry
2855                 if not entry:
2856                     continue
2857                 try:
2858                     # The item may have just been added to archive. Don't break due to it
2859                     if not self.ydl.params.get('lazy_playlist'):
2860                         # TODO: Add auto-generated fields
2861                         self.ydl._match_entry(entry, incomplete=True, silent=True)
2862                 except (ExistingVideoReached, RejectedVideoReached):
2863                     return
2864
2865     def get_full_count(self):
2866         if self.is_exhausted and not self.is_incomplete:
2867             return len(self)
2868         elif isinstance(self._entries, InAdvancePagedList):
2869             if self._entries._pagesize == 1:
2870                 return self._entries._pagecount
2871
2872     @functools.cached_property
2873     def _getter(self):
2874         if isinstance(self._entries, list):
2875             def get_entry(i):
2876                 try:
2877                     entry = self._entries[i]
2878                 except IndexError:
2879                     entry = self.MissingEntry
2880                     if not self.is_incomplete:
2881                         raise self.IndexError()
2882                 if entry is self.MissingEntry:
2883                     raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
2884                 return entry
2885         else:
2886             def get_entry(i):
2887                 try:
2888                     return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2889                 except (LazyList.IndexError, PagedList.IndexError):
2890                     raise self.IndexError()
2891         return get_entry
2892
2893     def __getitem__(self, idx):
2894         if isinstance(idx, int):
2895             idx = slice(idx, idx)
2896
2897         # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2898         step = 1 if idx.step is None else idx.step
2899         if idx.start is None:
2900             start = 0 if step > 0 else len(self) - 1
2901         else:
2902             start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2903
2904         # NB: Do not call len(self) when idx == [:]
2905         if idx.stop is None:
2906             stop = 0 if step < 0 else float('inf')
2907         else:
2908             stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2909         stop += [-1, 1][step > 0]
2910
2911         for i in frange(start, stop, step):
2912             if i < 0:
2913                 continue
2914             try:
2915                 entry = self._getter(i)
2916             except self.IndexError:
2917                 self.is_exhausted = True
2918                 if step > 0:
2919                     break
2920                 continue
2921             yield i + 1, entry
2922
2923     def __len__(self):
2924         return len(tuple(self[:]))
2925
2926     class IndexError(IndexError):
2927         pass
2928
2929
2930 def uppercase_escape(s):
2931     unicode_escape = codecs.getdecoder('unicode_escape')
2932     return re.sub(
2933         r'\\U[0-9a-fA-F]{8}',
2934         lambda m: unicode_escape(m.group(0))[0],
2935         s)
2936
2937
2938 def lowercase_escape(s):
2939     unicode_escape = codecs.getdecoder('unicode_escape')
2940     return re.sub(
2941         r'\\u[0-9a-fA-F]{4}',
2942         lambda m: unicode_escape(m.group(0))[0],
2943         s)
2944
2945
2946 def escape_rfc3986(s):
2947     """Escape non-ASCII characters as suggested by RFC 3986"""
2948     return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2949
2950
2951 def escape_url(url):
2952     """Escape URL as suggested by RFC 3986"""
2953     url_parsed = urllib.parse.urlparse(url)
2954     return url_parsed._replace(
2955         netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2956         path=escape_rfc3986(url_parsed.path),
2957         params=escape_rfc3986(url_parsed.params),
2958         query=escape_rfc3986(url_parsed.query),
2959         fragment=escape_rfc3986(url_parsed.fragment)
2960     ).geturl()
2961
2962
2963 def parse_qs(url, **kwargs):
2964     return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
2965
2966
2967 def read_batch_urls(batch_fd):
2968     def fixup(url):
2969         if not isinstance(url, str):
2970             url = url.decode('utf-8', 'replace')
2971         BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2972         for bom in BOM_UTF8:
2973             if url.startswith(bom):
2974                 url = url[len(bom):]
2975         url = url.lstrip()
2976         if not url or url.startswith(('#', ';', ']')):
2977             return False
2978         # "#" cannot be stripped out since it is part of the URI
2979         # However, it can be safely stripped out if following a whitespace
2980         return re.split(r'\s#', url, 1)[0].rstrip()
2981
2982     with contextlib.closing(batch_fd) as fd:
2983         return [url for url in map(fixup, fd) if url]
2984
2985
2986 def urlencode_postdata(*args, **kargs):
2987     return urllib.parse.urlencode(*args, **kargs).encode('ascii')
2988
2989
2990 def update_url(url, *, query_update=None, **kwargs):
2991     """Replace URL components specified by kwargs
2992        @param url           str or parse url tuple
2993        @param query_update  update query
2994        @returns             str
2995     """
2996     if isinstance(url, str):
2997         if not kwargs and not query_update:
2998             return url
2999         else:
3000             url = urllib.parse.urlparse(url)
3001     if query_update:
3002         assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
3003         kwargs['query'] = urllib.parse.urlencode({
3004             **urllib.parse.parse_qs(url.query),
3005             **query_update
3006         }, True)
3007     return urllib.parse.urlunparse(url._replace(**kwargs))
3008
3009
3010 def update_url_query(url, query):
3011     return update_url(url, query_update=query)
3012
3013
3014 def update_Request(req, url=None, data=None, headers=None, query=None):
3015     req_headers = req.headers.copy()
3016     req_headers.update(headers or {})
3017     req_data = data or req.data
3018     req_url = update_url_query(url or req.get_full_url(), query)
3019     req_get_method = req.get_method()
3020     if req_get_method == 'HEAD':
3021         req_type = HEADRequest
3022     elif req_get_method == 'PUT':
3023         req_type = PUTRequest
3024     else:
3025         req_type = urllib.request.Request
3026     new_req = req_type(
3027         req_url, data=req_data, headers=req_headers,
3028         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3029     if hasattr(req, 'timeout'):
3030         new_req.timeout = req.timeout
3031     return new_req
3032
3033
3034 def _multipart_encode_impl(data, boundary):
3035     content_type = 'multipart/form-data; boundary=%s' % boundary
3036
3037     out = b''
3038     for k, v in data.items():
3039         out += b'--' + boundary.encode('ascii') + b'\r\n'
3040         if isinstance(k, str):
3041             k = k.encode()
3042         if isinstance(v, str):
3043             v = v.encode()
3044         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3045         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3046         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3047         if boundary.encode('ascii') in content:
3048             raise ValueError('Boundary overlaps with data')
3049         out += content
3050
3051     out += b'--' + boundary.encode('ascii') + b'--\r\n'
3052
3053     return out, content_type
3054
3055
3056 def multipart_encode(data, boundary=None):
3057     '''
3058     Encode a dict to RFC 7578-compliant form-data
3059
3060     data:
3061         A dict where keys and values can be either Unicode or bytes-like
3062         objects.
3063     boundary:
3064         If specified a Unicode object, it's used as the boundary. Otherwise
3065         a random boundary is generated.
3066
3067     Reference: https://tools.ietf.org/html/rfc7578
3068     '''
3069     has_specified_boundary = boundary is not None
3070
3071     while True:
3072         if boundary is None:
3073             boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3074
3075         try:
3076             out, content_type = _multipart_encode_impl(data, boundary)
3077             break
3078         except ValueError:
3079             if has_specified_boundary:
3080                 raise
3081             boundary = None
3082
3083     return out, content_type
3084
3085
3086 def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
3087     if blocked_types is NO_DEFAULT:
3088         blocked_types = (str, bytes, collections.abc.Mapping)
3089     return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
3090
3091
3092 def variadic(x, allowed_types=NO_DEFAULT):
3093     if not isinstance(allowed_types, (tuple, type)):
3094         deprecation_warning('allowed_types should be a tuple or a type')
3095         allowed_types = tuple(allowed_types)
3096     return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
3097
3098
3099 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3100     for f in funcs:
3101         try:
3102             val = f(*args, **kwargs)
3103         except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
3104             pass
3105         else:
3106             if expected_type is None or isinstance(val, expected_type):
3107                 return val
3108
3109
3110 def try_get(src, getter, expected_type=None):
3111     return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3112
3113
3114 def filter_dict(dct, cndn=lambda _, v: v is not None):
3115     return {k: v for k, v in dct.items() if cndn(k, v)}
3116
3117
3118 def merge_dicts(*dicts):
3119     merged = {}
3120     for a_dict in dicts:
3121         for k, v in a_dict.items():
3122             if (v is not None and k not in merged
3123                     or isinstance(v, str) and merged[k] == ''):
3124                 merged[k] = v
3125     return merged
3126
3127
3128 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3129     return string if isinstance(string, str) else str(string, encoding, errors)
3130
3131
3132 US_RATINGS = {
3133     'G': 0,
3134     'PG': 10,
3135     'PG-13': 13,
3136     'R': 16,
3137     'NC': 18,
3138 }
3139
3140
3141 TV_PARENTAL_GUIDELINES = {
3142     'TV-Y': 0,
3143     'TV-Y7': 7,
3144     'TV-G': 0,
3145     'TV-PG': 0,
3146     'TV-14': 14,
3147     'TV-MA': 17,
3148 }
3149
3150
3151 def parse_age_limit(s):
3152     # isinstance(False, int) is True. So type() must be used instead
3153     if type(s) is int:  # noqa: E721
3154         return s if 0 <= s <= 21 else None
3155     elif not isinstance(s, str):
3156         return None
3157     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3158     if m:
3159         return int(m.group('age'))
3160     s = s.upper()
3161     if s in US_RATINGS:
3162         return US_RATINGS[s]
3163     m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3164     if m:
3165         return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3166     return None
3167
3168
3169 def strip_jsonp(code):
3170     return re.sub(
3171         r'''(?sx)^
3172             (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3173             (?:\s*&&\s*(?P=func_name))?
3174             \s*\(\s*(?P<callback_data>.*)\);?
3175             \s*?(?://[^\n]*)*$''',
3176         r'\g<callback_data>', code)
3177
3178
3179 def js_to_json(code, vars={}, *, strict=False):
3180     # vars is a dict of var, val pairs to substitute
3181     STRING_QUOTES = '\'"`'
3182     STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
3183     COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3184     SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3185     INTEGER_TABLE = (
3186         (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3187         (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3188     )
3189
3190     def process_escape(match):
3191         JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3192         escape = match.group(1) or match.group(2)
3193
3194         return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3195                 else R'\u00' if escape == 'x'
3196                 else '' if escape == '\n'
3197                 else escape)
3198
3199     def template_substitute(match):
3200         evaluated = js_to_json(match.group(1), vars, strict=strict)
3201         if evaluated[0] == '"':
3202             return json.loads(evaluated)
3203         return evaluated
3204
3205     def fix_kv(m):
3206         v = m.group(0)
3207         if v in ('true', 'false', 'null'):
3208             return v
3209         elif v in ('undefined', 'void 0'):
3210             return 'null'
3211         elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3212             return ''
3213
3214         if v[0] in STRING_QUOTES:
3215             v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
3216             escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
3217             return f'"{escaped}"'
3218
3219         for regex, base in INTEGER_TABLE:
3220             im = re.match(regex, v)
3221             if im:
3222                 i = int(im.group(1), base)
3223                 return f'"{i}":' if v.endswith(':') else str(i)
3224
3225         if v in vars:
3226             try:
3227                 if not strict:
3228                     json.loads(vars[v])
3229             except json.JSONDecodeError:
3230                 return json.dumps(vars[v])
3231             else:
3232                 return vars[v]
3233
3234         if not strict:
3235             return f'"{v}"'
3236
3237         raise ValueError(f'Unknown value: {v}')
3238
3239     def create_map(mobj):
3240         return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3241
3242     code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3243     if not strict:
3244         code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3245         code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
3246         code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
3247         code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
3248
3249     return re.sub(rf'''(?sx)
3250         {STRING_RE}|
3251         {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
3252         void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3253         \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3254         [0-9]+(?={SKIP_RE}:)|
3255         !+
3256         ''', fix_kv, code)
3257
3258
3259 def qualities(quality_ids):
3260     """ Get a numeric quality value out of a list of possible values """
3261     def q(qid):
3262         try:
3263             return quality_ids.index(qid)
3264         except ValueError:
3265             return -1
3266     return q
3267
3268
3269 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3270
3271
3272 DEFAULT_OUTTMPL = {
3273     'default': '%(title)s [%(id)s].%(ext)s',
3274     'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3275 }
3276 OUTTMPL_TYPES = {
3277     'chapter': None,
3278     'subtitle': None,
3279     'thumbnail': None,
3280     'description': 'description',
3281     'annotation': 'annotations.xml',
3282     'infojson': 'info.json',
3283     'link': None,
3284     'pl_video': None,
3285     'pl_thumbnail': None,
3286     'pl_description': 'description',
3287     'pl_infojson': 'info.json',
3288 }
3289
3290 # As of [1] format syntax is:
3291 #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3292 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3293 STR_FORMAT_RE_TMPL = r'''(?x)
3294     (?<!%)(?P<prefix>(?:%%)*)
3295     %
3296     (?P<has_key>\((?P<key>{0})\))?
3297     (?P<format>
3298         (?P<conversion>[#0\-+ ]+)?
3299         (?P<min_width>\d+)?
3300         (?P<precision>\.\d+)?
3301         (?P<len_mod>[hlL])?  # unused in python
3302         {1}  # conversion type
3303     )
3304 '''
3305
3306
3307 STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
3308
3309
3310 def limit_length(s, length):
3311     """ Add ellipses to overly long strings """
3312     if s is None:
3313         return None
3314     ELLIPSES = '...'
3315     if len(s) > length:
3316         return s[:length - len(ELLIPSES)] + ELLIPSES
3317     return s
3318
3319
3320 def version_tuple(v):
3321     return tuple(int(e) for e in re.split(r'[-.]', v))
3322
3323
3324 def is_outdated_version(version, limit, assume_new=True):
3325     if not version:
3326         return not assume_new
3327     try:
3328         return version_tuple(version) < version_tuple(limit)
3329     except ValueError:
3330         return not assume_new
3331
3332
3333 def ytdl_is_updateable():
3334     """ Returns if yt-dlp can be updated with -U """
3335
3336     from ..update import is_non_updateable
3337
3338     return not is_non_updateable()
3339
3340
3341 def args_to_str(args):
3342     # Get a short string representation for a subprocess command
3343     return ' '.join(compat_shlex_quote(a) for a in args)
3344
3345
3346 def error_to_str(err):
3347     return f'{type(err).__name__}: {err}'
3348
3349
3350 def mimetype2ext(mt, default=NO_DEFAULT):
3351     if not isinstance(mt, str):
3352         if default is not NO_DEFAULT:
3353             return default
3354         return None
3355
3356     MAP = {
3357         # video
3358         '3gpp': '3gp',
3359         'mp2t': 'ts',
3360         'mp4': 'mp4',
3361         'mpeg': 'mpeg',
3362         'mpegurl': 'm3u8',
3363         'quicktime': 'mov',
3364         'webm': 'webm',
3365         'vp9': 'vp9',
3366         'x-flv': 'flv',
3367         'x-m4v': 'm4v',
3368         'x-matroska': 'mkv',
3369         'x-mng': 'mng',
3370         'x-mp4-fragmented': 'mp4',
3371         'x-ms-asf': 'asf',
3372         'x-ms-wmv': 'wmv',
3373         'x-msvideo': 'avi',
3374
3375         # application (streaming playlists)
3376         'dash+xml': 'mpd',
3377         'f4m+xml': 'f4m',
3378         'hds+xml': 'f4m',
3379         'vnd.apple.mpegurl': 'm3u8',
3380         'vnd.ms-sstr+xml': 'ism',
3381         'x-mpegurl': 'm3u8',
3382
3383         # audio
3384         'audio/mp4': 'm4a',
3385         # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3386         # Using .mp3 as it's the most popular one
3387         'audio/mpeg': 'mp3',
3388         'audio/webm': 'webm',
3389         'audio/x-matroska': 'mka',
3390         'audio/x-mpegurl': 'm3u',
3391         'midi': 'mid',
3392         'ogg': 'ogg',
3393         'wav': 'wav',
3394         'wave': 'wav',
3395         'x-aac': 'aac',
3396         'x-flac': 'flac',
3397         'x-m4a': 'm4a',
3398         'x-realaudio': 'ra',
3399         'x-wav': 'wav',
3400
3401         # image
3402         'avif': 'avif',
3403         'bmp': 'bmp',
3404         'gif': 'gif',
3405         'jpeg': 'jpg',
3406         'png': 'png',
3407         'svg+xml': 'svg',
3408         'tiff': 'tif',
3409         'vnd.wap.wbmp': 'wbmp',
3410         'webp': 'webp',
3411         'x-icon': 'ico',
3412         'x-jng': 'jng',
3413         'x-ms-bmp': 'bmp',
3414
3415         # caption
3416         'filmstrip+json': 'fs',
3417         'smptett+xml': 'tt',
3418         'ttaf+xml': 'dfxp',
3419         'ttml+xml': 'ttml',
3420         'x-ms-sami': 'sami',
3421
3422         # misc
3423         'gzip': 'gz',
3424         'json': 'json',
3425         'xml': 'xml',
3426         'zip': 'zip',
3427     }
3428
3429     mimetype = mt.partition(';')[0].strip().lower()
3430     _, _, subtype = mimetype.rpartition('/')
3431
3432     ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
3433     if ext:
3434         return ext
3435     elif default is not NO_DEFAULT:
3436         return default
3437     return subtype.replace('+', '.')
3438
3439
3440 def ext2mimetype(ext_or_url):
3441     if not ext_or_url:
3442         return None
3443     if '.' not in ext_or_url:
3444         ext_or_url = f'file.{ext_or_url}'
3445     return mimetypes.guess_type(ext_or_url)[0]
3446
3447
3448 def parse_codecs(codecs_str):
3449     # http://tools.ietf.org/html/rfc6381
3450     if not codecs_str:
3451         return {}
3452     split_codecs = list(filter(None, map(
3453         str.strip, codecs_str.strip().strip(',').split(','))))
3454     vcodec, acodec, scodec, hdr = None, None, None, None
3455     for full_codec in split_codecs:
3456         parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3457         if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3458                         'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3459             if vcodec:
3460                 continue
3461             vcodec = full_codec
3462             if parts[0] in ('dvh1', 'dvhe'):
3463                 hdr = 'DV'
3464             elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
3465                 hdr = 'HDR10'
3466             elif parts[:2] == ['vp9', '2']:
3467                 hdr = 'HDR10'
3468         elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
3469                           'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3470             acodec = acodec or full_codec
3471         elif parts[0] in ('stpp', 'wvtt'):
3472             scodec = scodec or full_codec
3473         else:
3474             write_string(f'WARNING: Unknown codec {full_codec}\n')
3475     if vcodec or acodec or scodec:
3476         return {
3477             'vcodec': vcodec or 'none',
3478             'acodec': acodec or 'none',
3479             'dynamic_range': hdr,
3480             **({'scodec': scodec} if scodec is not None else {}),
3481         }
3482     elif len(split_codecs) == 2:
3483         return {
3484             'vcodec': split_codecs[0],
3485             'acodec': split_codecs[1],
3486         }
3487     return {}
3488
3489
3490 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3491     assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3492
3493     allow_mkv = not preferences or 'mkv' in preferences
3494
3495     if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3496         return 'mkv'  # TODO: any other format allows this?
3497
3498     # TODO: All codecs supported by parse_codecs isn't handled here
3499     COMPATIBLE_CODECS = {
3500         'mp4': {
3501             'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
3502             'h264', 'aacl', 'ec-3',  # Set in ISM
3503         },
3504         'webm': {
3505             'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3506             'vp9x', 'vp8x',  # in the webm spec
3507         },
3508     }
3509
3510     sanitize_codec = functools.partial(
3511         try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
3512     vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3513
3514     for ext in preferences or COMPATIBLE_CODECS.keys():
3515         codec_set = COMPATIBLE_CODECS.get(ext, set())
3516         if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3517             return ext
3518
3519     COMPATIBLE_EXTS = (
3520         {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3521         {'webm', 'weba'},
3522     )
3523     for ext in preferences or vexts:
3524         current_exts = {ext, *vexts, *aexts}
3525         if ext == 'mkv' or current_exts == {ext} or any(
3526                 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3527             return ext
3528     return 'mkv' if allow_mkv else preferences[-1]
3529
3530
3531 def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
3532     getheader = url_handle.headers.get
3533
3534     cd = getheader('Content-Disposition')
3535     if cd:
3536         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3537         if m:
3538             e = determine_ext(m.group('filename'), default_ext=None)
3539             if e:
3540                 return e
3541
3542     meta_ext = getheader('x-amz-meta-name')
3543     if meta_ext:
3544         e = meta_ext.rpartition('.')[2]
3545         if e:
3546             return e
3547
3548     return mimetype2ext(getheader('Content-Type'), default=default)
3549
3550
3551 def encode_data_uri(data, mime_type):
3552     return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3553
3554
3555 def age_restricted(content_limit, age_limit):
3556     """ Returns True iff the content should be blocked """
3557
3558     if age_limit is None:  # No limit set
3559         return False
3560     if content_limit is None:
3561         return False  # Content available for everyone
3562     return age_limit < content_limit
3563
3564
3565 # List of known byte-order-marks (BOM)
3566 BOMS = [
3567     (b'\xef\xbb\xbf', 'utf-8'),
3568     (b'\x00\x00\xfe\xff', 'utf-32-be'),
3569     (b'\xff\xfe\x00\x00', 'utf-32-le'),
3570     (b'\xff\xfe', 'utf-16-le'),
3571     (b'\xfe\xff', 'utf-16-be'),
3572 ]
3573
3574
3575 def is_html(first_bytes):
3576     """ Detect whether a file contains HTML by examining its first bytes. """
3577
3578     encoding = 'utf-8'
3579     for bom, enc in BOMS:
3580         while first_bytes.startswith(bom):
3581             encoding, first_bytes = enc, first_bytes[len(bom):]
3582
3583     return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3584
3585
3586 def determine_protocol(info_dict):
3587     protocol = info_dict.get('protocol')
3588     if protocol is not None:
3589         return protocol
3590
3591     url = sanitize_url(info_dict['url'])
3592     if url.startswith('rtmp'):
3593         return 'rtmp'
3594     elif url.startswith('mms'):
3595         return 'mms'
3596     elif url.startswith('rtsp'):
3597         return 'rtsp'
3598
3599     ext = determine_ext(url)
3600     if ext == 'm3u8':
3601         return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3602     elif ext == 'f4m':
3603         return 'f4m'
3604
3605     return urllib.parse.urlparse(url).scheme
3606
3607
3608 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3609     """ Render a list of rows, each as a list of values.
3610     Text after a \t will be right aligned """
3611     def width(string):
3612         return len(remove_terminal_sequences(string).replace('\t', ''))
3613
3614     def get_max_lens(table):
3615         return [max(width(str(v)) for v in col) for col in zip(*table)]
3616
3617     def filter_using_list(row, filterArray):
3618         return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3619
3620     max_lens = get_max_lens(data) if hide_empty else []
3621     header_row = filter_using_list(header_row, max_lens)
3622     data = [filter_using_list(row, max_lens) for row in data]
3623
3624     table = [header_row] + data
3625     max_lens = get_max_lens(table)
3626     extra_gap += 1
3627     if delim:
3628         table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3629         table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
3630     for row in table:
3631         for pos, text in enumerate(map(str, row)):
3632             if '\t' in text:
3633                 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3634             else:
3635                 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3636     ret = '\n'.join(''.join(row).rstrip() for row in table)
3637     return ret
3638
3639
3640 def _match_one(filter_part, dct, incomplete):
3641     # TODO: Generalize code with YoutubeDL._build_format_filter
3642     STRING_OPERATORS = {
3643         '*=': operator.contains,
3644         '^=': lambda attr, value: attr.startswith(value),
3645         '$=': lambda attr, value: attr.endswith(value),
3646         '~=': lambda attr, value: re.search(value, attr),
3647     }
3648     COMPARISON_OPERATORS = {
3649         **STRING_OPERATORS,
3650         '<=': operator.le,  # "<=" must be defined above "<"
3651         '<': operator.lt,
3652         '>=': operator.ge,
3653         '>': operator.gt,
3654         '=': operator.eq,
3655     }
3656
3657     if isinstance(incomplete, bool):
3658         is_incomplete = lambda _: incomplete
3659     else:
3660         is_incomplete = lambda k: k in incomplete
3661
3662     operator_rex = re.compile(r'''(?x)
3663         (?P<key>[a-z_]+)
3664         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3665         (?:
3666             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3667             (?P<strval>.+?)
3668         )
3669         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3670     m = operator_rex.fullmatch(filter_part.strip())
3671     if m:
3672         m = m.groupdict()
3673         unnegated_op = COMPARISON_OPERATORS[m['op']]
3674         if m['negation']:
3675             op = lambda attr, value: not unnegated_op(attr, value)
3676         else:
3677             op = unnegated_op
3678         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3679         if m['quote']:
3680             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3681         actual_value = dct.get(m['key'])
3682         numeric_comparison = None
3683         if isinstance(actual_value, (int, float)):
3684             # If the original field is a string and matching comparisonvalue is
3685             # a number we should respect the origin of the original field
3686             # and process comparison value as a string (see
3687             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3688             try:
3689                 numeric_comparison = int(comparison_value)
3690             except ValueError:
3691                 numeric_comparison = parse_filesize(comparison_value)
3692                 if numeric_comparison is None:
3693                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3694                 if numeric_comparison is None:
3695                     numeric_comparison = parse_duration(comparison_value)
3696         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3697             raise ValueError('Operator %s only supports string values!' % m['op'])
3698         if actual_value is None:
3699             return is_incomplete(m['key']) or m['none_inclusive']
3700         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3701
3702     UNARY_OPERATORS = {
3703         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3704         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3705     }
3706     operator_rex = re.compile(r'''(?x)
3707         (?P<op>%s)\s*(?P<key>[a-z_]+)
3708         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3709     m = operator_rex.fullmatch(filter_part.strip())
3710     if m:
3711         op = UNARY_OPERATORS[m.group('op')]
3712         actual_value = dct.get(m.group('key'))
3713         if is_incomplete(m.group('key')) and actual_value is None:
3714             return True
3715         return op(actual_value)
3716
3717     raise ValueError('Invalid filter part %r' % filter_part)
3718
3719
3720 def match_str(filter_str, dct, incomplete=False):
3721     """ Filter a dictionary with a simple string syntax.
3722     @returns           Whether the filter passes
3723     @param incomplete  Set of keys that is expected to be missing from dct.
3724                        Can be True/False to indicate all/none of the keys may be missing.
3725                        All conditions on incomplete keys pass if the key is missing
3726     """
3727     return all(
3728         _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3729         for filter_part in re.split(r'(?<!\\)&', filter_str))
3730
3731
3732 def match_filter_func(filters, breaking_filters=None):
3733     if not filters and not breaking_filters:
3734         return None
3735     breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3736     filters = set(variadic(filters or []))
3737
3738     interactive = '-' in filters
3739     if interactive:
3740         filters.remove('-')
3741
3742     def _match_func(info_dict, incomplete=False):
3743         ret = breaking_filters(info_dict, incomplete)
3744         if ret is not None:
3745             raise RejectedVideoReached(ret)
3746
3747         if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3748             return NO_DEFAULT if interactive and not incomplete else None
3749         else:
3750             video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3751             filter_str = ') | ('.join(map(str.strip, filters))
3752             return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3753     return _match_func
3754
3755
3756 class download_range_func:
3757     def __init__(self, chapters, ranges, from_info=False):
3758         self.chapters, self.ranges, self.from_info = chapters, ranges, from_info
3759
3760     def __call__(self, info_dict, ydl):
3761
3762         warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3763                    else 'Cannot match chapters since chapter information is unavailable')
3764         for regex in self.chapters or []:
3765             for i, chapter in enumerate(info_dict.get('chapters') or []):
3766                 if re.search(regex, chapter['title']):
3767                     warning = None
3768                     yield {**chapter, 'index': i}
3769         if self.chapters and warning:
3770             ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3771
3772         for start, end in self.ranges or []:
3773             yield {
3774                 'start_time': self._handle_negative_timestamp(start, info_dict),
3775                 'end_time': self._handle_negative_timestamp(end, info_dict),
3776             }
3777
3778         if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
3779             yield {
3780                 'start_time': info_dict.get('start_time') or 0,
3781                 'end_time': info_dict.get('end_time') or float('inf'),
3782             }
3783         elif not self.ranges and not self.chapters:
3784             yield {}
3785
3786     @staticmethod
3787     def _handle_negative_timestamp(time, info):
3788         return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time
3789
3790     def __eq__(self, other):
3791         return (isinstance(other, download_range_func)
3792                 and self.chapters == other.chapters and self.ranges == other.ranges)
3793
3794     def __repr__(self):
3795         return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3796
3797
3798 def parse_dfxp_time_expr(time_expr):
3799     if not time_expr:
3800         return
3801
3802     mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3803     if mobj:
3804         return float(mobj.group('time_offset'))
3805
3806     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3807     if mobj:
3808         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3809
3810
3811 def srt_subtitles_timecode(seconds):
3812     return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3813
3814
3815 def ass_subtitles_timecode(seconds):
3816     time = timetuple_from_msec(seconds * 1000)
3817     return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3818
3819
3820 def dfxp2srt(dfxp_data):
3821     '''
3822     @param dfxp_data A bytes-like object containing DFXP data
3823     @returns A unicode object containing converted SRT data
3824     '''
3825     LEGACY_NAMESPACES = (
3826         (b'http://www.w3.org/ns/ttml', [
3827             b'http://www.w3.org/2004/11/ttaf1',
3828             b'http://www.w3.org/2006/04/ttaf1',
3829             b'http://www.w3.org/2006/10/ttaf1',
3830         ]),
3831         (b'http://www.w3.org/ns/ttml#styling', [
3832             b'http://www.w3.org/ns/ttml#style',
3833         ]),
3834     )
3835
3836     SUPPORTED_STYLING = [
3837         'color',
3838         'fontFamily',
3839         'fontSize',
3840         'fontStyle',
3841         'fontWeight',
3842         'textDecoration'
3843     ]
3844
3845     _x = functools.partial(xpath_with_ns, ns_map={
3846         'xml': 'http://www.w3.org/XML/1998/namespace',
3847         'ttml': 'http://www.w3.org/ns/ttml',
3848         'tts': 'http://www.w3.org/ns/ttml#styling',
3849     })
3850
3851     styles = {}
3852     default_style = {}
3853
3854     class TTMLPElementParser:
3855         _out = ''
3856         _unclosed_elements = []
3857         _applied_styles = []
3858
3859         def start(self, tag, attrib):
3860             if tag in (_x('ttml:br'), 'br'):
3861                 self._out += '\n'
3862             else:
3863                 unclosed_elements = []
3864                 style = {}
3865                 element_style_id = attrib.get('style')
3866                 if default_style:
3867                     style.update(default_style)
3868                 if element_style_id:
3869                     style.update(styles.get(element_style_id, {}))
3870                 for prop in SUPPORTED_STYLING:
3871                     prop_val = attrib.get(_x('tts:' + prop))
3872                     if prop_val:
3873                         style[prop] = prop_val
3874                 if style:
3875                     font = ''
3876                     for k, v in sorted(style.items()):
3877                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3878                             continue
3879                         if k == 'color':
3880                             font += ' color="%s"' % v
3881                         elif k == 'fontSize':
3882                             font += ' size="%s"' % v
3883                         elif k == 'fontFamily':
3884                             font += ' face="%s"' % v
3885                         elif k == 'fontWeight' and v == 'bold':
3886                             self._out += '<b>'
3887                             unclosed_elements.append('b')
3888                         elif k == 'fontStyle' and v == 'italic':
3889                             self._out += '<i>'
3890                             unclosed_elements.append('i')
3891                         elif k == 'textDecoration' and v == 'underline':
3892                             self._out += '<u>'
3893                             unclosed_elements.append('u')
3894                     if font:
3895                         self._out += '<font' + font + '>'
3896                         unclosed_elements.append('font')
3897                     applied_style = {}
3898                     if self._applied_styles:
3899                         applied_style.update(self._applied_styles[-1])
3900                     applied_style.update(style)
3901                     self._applied_styles.append(applied_style)
3902                 self._unclosed_elements.append(unclosed_elements)
3903
3904         def end(self, tag):
3905             if tag not in (_x('ttml:br'), 'br'):
3906                 unclosed_elements = self._unclosed_elements.pop()
3907                 for element in reversed(unclosed_elements):
3908                     self._out += '</%s>' % element
3909                 if unclosed_elements and self._applied_styles:
3910                     self._applied_styles.pop()
3911
3912         def data(self, data):
3913             self._out += data
3914
3915         def close(self):
3916             return self._out.strip()
3917
3918     # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
3919     # This will not trigger false positives since only UTF-8 text is being replaced
3920     dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
3921
3922     def parse_node(node):
3923         target = TTMLPElementParser()
3924         parser = xml.etree.ElementTree.XMLParser(target=target)
3925         parser.feed(xml.etree.ElementTree.tostring(node))
3926         return parser.close()
3927
3928     for k, v in LEGACY_NAMESPACES:
3929         for ns in v:
3930             dfxp_data = dfxp_data.replace(ns, k)
3931
3932     dfxp = compat_etree_fromstring(dfxp_data)
3933     out = []
3934     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3935
3936     if not paras:
3937         raise ValueError('Invalid dfxp/TTML subtitle')
3938
3939     repeat = False
3940     while True:
3941         for style in dfxp.findall(_x('.//ttml:style')):
3942             style_id = style.get('id') or style.get(_x('xml:id'))
3943             if not style_id:
3944                 continue
3945             parent_style_id = style.get('style')
3946             if parent_style_id:
3947                 if parent_style_id not in styles:
3948                     repeat = True
3949                     continue
3950                 styles[style_id] = styles[parent_style_id].copy()
3951             for prop in SUPPORTED_STYLING:
3952                 prop_val = style.get(_x('tts:' + prop))
3953                 if prop_val:
3954                     styles.setdefault(style_id, {})[prop] = prop_val
3955         if repeat:
3956             repeat = False
3957         else:
3958             break
3959
3960     for p in ('body', 'div'):
3961         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3962         if ele is None:
3963             continue
3964         style = styles.get(ele.get('style'))
3965         if not style:
3966             continue
3967         default_style.update(style)
3968
3969     for para, index in zip(paras, itertools.count(1)):
3970         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3971         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3972         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3973         if begin_time is None:
3974             continue
3975         if not end_time:
3976             if not dur:
3977                 continue
3978             end_time = begin_time + dur
3979         out.append('%d\n%s --> %s\n%s\n\n' % (
3980             index,
3981             srt_subtitles_timecode(begin_time),
3982             srt_subtitles_timecode(end_time),
3983             parse_node(para)))
3984
3985     return ''.join(out)
3986
3987
3988 def cli_option(params, command_option, param, separator=None):
3989     param = params.get(param)
3990     return ([] if param is None
3991             else [command_option, str(param)] if separator is None
3992             else [f'{command_option}{separator}{param}'])
3993
3994
3995 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3996     param = params.get(param)
3997     assert param in (True, False, None)
3998     return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3999
4000
4001 def cli_valueless_option(params, command_option, param, expected_value=True):
4002     return [command_option] if params.get(param) == expected_value else []
4003
4004
4005 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4006     if isinstance(argdict, (list, tuple)):  # for backward compatibility
4007         if use_compat:
4008             return argdict
4009         else:
4010             argdict = None
4011     if argdict is None:
4012         return default
4013     assert isinstance(argdict, dict)
4014
4015     assert isinstance(keys, (list, tuple))
4016     for key_list in keys:
4017         arg_list = list(filter(
4018             lambda x: x is not None,
4019             [argdict.get(key.lower()) for key in variadic(key_list)]))
4020         if arg_list:
4021             return [arg for args in arg_list for arg in args]
4022     return default
4023
4024
4025 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4026     main_key, exe = main_key.lower(), exe.lower()
4027     root_key = exe if main_key == exe else f'{main_key}+{exe}'
4028     keys = [f'{root_key}{k}' for k in (keys or [''])]
4029     if root_key in keys:
4030         if main_key != exe:
4031             keys.append((main_key, exe))
4032         keys.append('default')
4033     else:
4034         use_compat = False
4035     return cli_configuration_args(argdict, keys, default, use_compat)
4036
4037
4038 class ISO639Utils:
4039     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4040     _lang_map = {
4041         'aa': 'aar',
4042         'ab': 'abk',
4043         'ae': 'ave',
4044         'af': 'afr',
4045         'ak': 'aka',
4046         'am': 'amh',
4047         'an': 'arg',
4048         'ar': 'ara',
4049         'as': 'asm',
4050         'av': 'ava',
4051         'ay': 'aym',
4052         'az': 'aze',
4053         'ba': 'bak',
4054         'be': 'bel',
4055         'bg': 'bul',
4056         'bh': 'bih',
4057         'bi': 'bis',
4058         'bm': 'bam',
4059         'bn': 'ben',
4060         'bo': 'bod',
4061         'br': 'bre',
4062         'bs': 'bos',
4063         'ca': 'cat',
4064         'ce': 'che',
4065         'ch': 'cha',
4066         'co': 'cos',
4067         'cr': 'cre',
4068         'cs': 'ces',
4069         'cu': 'chu',
4070         'cv': 'chv',
4071         'cy': 'cym',
4072         'da': 'dan',
4073         'de': 'deu',
4074         'dv': 'div',
4075         'dz': 'dzo',
4076         'ee': 'ewe',
4077         'el': 'ell',
4078         'en': 'eng',
4079         'eo': 'epo',
4080         'es': 'spa',
4081         'et': 'est',
4082         'eu': 'eus',
4083         'fa': 'fas',
4084         'ff': 'ful',
4085         'fi': 'fin',
4086         'fj': 'fij',
4087         'fo': 'fao',
4088         'fr': 'fra',
4089         'fy': 'fry',
4090         'ga': 'gle',
4091         'gd': 'gla',
4092         'gl': 'glg',
4093         'gn': 'grn',
4094         'gu': 'guj',
4095         'gv': 'glv',
4096         'ha': 'hau',
4097         'he': 'heb',
4098         'iw': 'heb',  # Replaced by he in 1989 revision
4099         'hi': 'hin',
4100         'ho': 'hmo',
4101         'hr': 'hrv',
4102         'ht': 'hat',
4103         'hu': 'hun',
4104         'hy': 'hye',
4105         'hz': 'her',
4106         'ia': 'ina',
4107         'id': 'ind',
4108         'in': 'ind',  # Replaced by id in 1989 revision
4109         'ie': 'ile',
4110         'ig': 'ibo',
4111         'ii': 'iii',
4112         'ik': 'ipk',
4113         'io': 'ido',
4114         'is': 'isl',
4115         'it': 'ita',
4116         'iu': 'iku',
4117         'ja': 'jpn',
4118         'jv': 'jav',
4119         'ka': 'kat',
4120         'kg': 'kon',
4121         'ki': 'kik',
4122         'kj': 'kua',
4123         'kk': 'kaz',
4124         'kl': 'kal',
4125         'km': 'khm',
4126         'kn': 'kan',
4127         'ko': 'kor',
4128         'kr': 'kau',
4129         'ks': 'kas',
4130         'ku': 'kur',
4131         'kv': 'kom',
4132         'kw': 'cor',
4133         'ky': 'kir',
4134         'la': 'lat',
4135         'lb': 'ltz',
4136         'lg': 'lug',
4137         'li': 'lim',
4138         'ln': 'lin',
4139         'lo': 'lao',
4140         'lt': 'lit',
4141         'lu': 'lub',
4142         'lv': 'lav',
4143         'mg': 'mlg',
4144         'mh': 'mah',
4145         'mi': 'mri',
4146         'mk': 'mkd',
4147         'ml': 'mal',
4148         'mn': 'mon',
4149         'mr': 'mar',
4150         'ms': 'msa',
4151         'mt': 'mlt',
4152         'my': 'mya',
4153         'na': 'nau',
4154         'nb': 'nob',
4155         'nd': 'nde',
4156         'ne': 'nep',
4157         'ng': 'ndo',
4158         'nl': 'nld',
4159         'nn': 'nno',
4160         'no': 'nor',
4161         'nr': 'nbl',
4162         'nv': 'nav',
4163         'ny': 'nya',
4164         'oc': 'oci',
4165         'oj': 'oji',
4166         'om': 'orm',
4167         'or': 'ori',
4168         'os': 'oss',
4169         'pa': 'pan',
4170         'pe': 'per',
4171         'pi': 'pli',
4172         'pl': 'pol',
4173         'ps': 'pus',
4174         'pt': 'por',
4175         'qu': 'que',
4176         'rm': 'roh',
4177         'rn': 'run',
4178         'ro': 'ron',
4179         'ru': 'rus',
4180         'rw': 'kin',
4181         'sa': 'san',
4182         'sc': 'srd',
4183         'sd': 'snd',
4184         'se': 'sme',
4185         'sg': 'sag',
4186         'si': 'sin',
4187         'sk': 'slk',
4188         'sl': 'slv',
4189         'sm': 'smo',
4190         'sn': 'sna',
4191         'so': 'som',
4192         'sq': 'sqi',
4193         'sr': 'srp',
4194         'ss': 'ssw',
4195         'st': 'sot',
4196         'su': 'sun',
4197         'sv': 'swe',
4198         'sw': 'swa',
4199         'ta': 'tam',
4200         'te': 'tel',
4201         'tg': 'tgk',
4202         'th': 'tha',
4203         'ti': 'tir',
4204         'tk': 'tuk',
4205         'tl': 'tgl',
4206         'tn': 'tsn',
4207         'to': 'ton',
4208         'tr': 'tur',
4209         'ts': 'tso',
4210         'tt': 'tat',
4211         'tw': 'twi',
4212         'ty': 'tah',
4213         'ug': 'uig',
4214         'uk': 'ukr',
4215         'ur': 'urd',
4216         'uz': 'uzb',
4217         've': 'ven',
4218         'vi': 'vie',
4219         'vo': 'vol',
4220         'wa': 'wln',
4221         'wo': 'wol',
4222         'xh': 'xho',
4223         'yi': 'yid',
4224         'ji': 'yid',  # Replaced by yi in 1989 revision
4225         'yo': 'yor',
4226         'za': 'zha',
4227         'zh': 'zho',
4228         'zu': 'zul',
4229     }
4230
4231     @classmethod
4232     def short2long(cls, code):
4233         """Convert language code from ISO 639-1 to ISO 639-2/T"""
4234         return cls._lang_map.get(code[:2])
4235
4236     @classmethod
4237     def long2short(cls, code):
4238         """Convert language code from ISO 639-2/T to ISO 639-1"""
4239         for short_name, long_name in cls._lang_map.items():
4240             if long_name == code:
4241                 return short_name
4242
4243
4244 class ISO3166Utils:
4245     # From http://data.okfn.org/data/core/country-list
4246     _country_map = {
4247         'AF': 'Afghanistan',
4248         'AX': 'Åland Islands',
4249         'AL': 'Albania',
4250         'DZ': 'Algeria',
4251         'AS': 'American Samoa',
4252         'AD': 'Andorra',
4253         'AO': 'Angola',
4254         'AI': 'Anguilla',
4255         'AQ': 'Antarctica',
4256         'AG': 'Antigua and Barbuda',
4257         'AR': 'Argentina',
4258         'AM': 'Armenia',
4259         'AW': 'Aruba',
4260         'AU': 'Australia',
4261         'AT': 'Austria',
4262         'AZ': 'Azerbaijan',
4263         'BS': 'Bahamas',
4264         'BH': 'Bahrain',
4265         'BD': 'Bangladesh',
4266         'BB': 'Barbados',
4267         'BY': 'Belarus',
4268         'BE': 'Belgium',
4269         'BZ': 'Belize',
4270         'BJ': 'Benin',
4271         'BM': 'Bermuda',
4272         'BT': 'Bhutan',
4273         'BO': 'Bolivia, Plurinational State of',
4274         'BQ': 'Bonaire, Sint Eustatius and Saba',
4275         'BA': 'Bosnia and Herzegovina',
4276         'BW': 'Botswana',
4277         'BV': 'Bouvet Island',
4278         'BR': 'Brazil',
4279         'IO': 'British Indian Ocean Territory',
4280         'BN': 'Brunei Darussalam',
4281         'BG': 'Bulgaria',
4282         'BF': 'Burkina Faso',
4283         'BI': 'Burundi',
4284         'KH': 'Cambodia',
4285         'CM': 'Cameroon',
4286         'CA': 'Canada',
4287         'CV': 'Cape Verde',
4288         'KY': 'Cayman Islands',
4289         'CF': 'Central African Republic',
4290         'TD': 'Chad',
4291         'CL': 'Chile',
4292         'CN': 'China',
4293         'CX': 'Christmas Island',
4294         'CC': 'Cocos (Keeling) Islands',
4295         'CO': 'Colombia',
4296         'KM': 'Comoros',
4297         'CG': 'Congo',
4298         'CD': 'Congo, the Democratic Republic of the',
4299         'CK': 'Cook Islands',
4300         'CR': 'Costa Rica',
4301         'CI': 'Côte d\'Ivoire',
4302         'HR': 'Croatia',
4303         'CU': 'Cuba',
4304         'CW': 'Curaçao',
4305         'CY': 'Cyprus',
4306         'CZ': 'Czech Republic',
4307         'DK': 'Denmark',
4308         'DJ': 'Djibouti',
4309         'DM': 'Dominica',
4310         'DO': 'Dominican Republic',
4311         'EC': 'Ecuador',
4312         'EG': 'Egypt',
4313         'SV': 'El Salvador',
4314         'GQ': 'Equatorial Guinea',
4315         'ER': 'Eritrea',
4316         'EE': 'Estonia',
4317         'ET': 'Ethiopia',
4318         'FK': 'Falkland Islands (Malvinas)',
4319         'FO': 'Faroe Islands',
4320         'FJ': 'Fiji',
4321         'FI': 'Finland',
4322         'FR': 'France',
4323         'GF': 'French Guiana',
4324         'PF': 'French Polynesia',
4325         'TF': 'French Southern Territories',
4326         'GA': 'Gabon',
4327         'GM': 'Gambia',
4328         'GE': 'Georgia',
4329         'DE': 'Germany',
4330         'GH': 'Ghana',
4331         'GI': 'Gibraltar',
4332         'GR': 'Greece',
4333         'GL': 'Greenland',
4334         'GD': 'Grenada',
4335         'GP': 'Guadeloupe',
4336         'GU': 'Guam',
4337         'GT': 'Guatemala',
4338         'GG': 'Guernsey',
4339         'GN': 'Guinea',
4340         'GW': 'Guinea-Bissau',
4341         'GY': 'Guyana',
4342         'HT': 'Haiti',
4343         'HM': 'Heard Island and McDonald Islands',
4344         'VA': 'Holy See (Vatican City State)',
4345         'HN': 'Honduras',
4346         'HK': 'Hong Kong',
4347         'HU': 'Hungary',
4348         'IS': 'Iceland',
4349         'IN': 'India',
4350         'ID': 'Indonesia',
4351         'IR': 'Iran, Islamic Republic of',
4352         'IQ': 'Iraq',
4353         'IE': 'Ireland',
4354         'IM': 'Isle of Man',
4355         'IL': 'Israel',
4356         'IT': 'Italy',
4357         'JM': 'Jamaica',
4358         'JP': 'Japan',
4359         'JE': 'Jersey',
4360         'JO': 'Jordan',
4361         'KZ': 'Kazakhstan',
4362         'KE': 'Kenya',
4363         'KI': 'Kiribati',
4364         'KP': 'Korea, Democratic People\'s Republic of',
4365         'KR': 'Korea, Republic of',
4366         'KW': 'Kuwait',
4367         'KG': 'Kyrgyzstan',
4368         'LA': 'Lao People\'s Democratic Republic',
4369         'LV': 'Latvia',
4370         'LB': 'Lebanon',
4371         'LS': 'Lesotho',
4372         'LR': 'Liberia',
4373         'LY': 'Libya',
4374         'LI': 'Liechtenstein',
4375         'LT': 'Lithuania',
4376         'LU': 'Luxembourg',
4377         'MO': 'Macao',
4378         'MK': 'Macedonia, the Former Yugoslav Republic of',
4379         'MG': 'Madagascar',
4380         'MW': 'Malawi',
4381         'MY': 'Malaysia',
4382         'MV': 'Maldives',
4383         'ML': 'Mali',
4384         'MT': 'Malta',
4385         'MH': 'Marshall Islands',
4386         'MQ': 'Martinique',
4387         'MR': 'Mauritania',
4388         'MU': 'Mauritius',
4389         'YT': 'Mayotte',
4390         'MX': 'Mexico',
4391         'FM': 'Micronesia, Federated States of',
4392         'MD': 'Moldova, Republic of',
4393         'MC': 'Monaco',
4394         'MN': 'Mongolia',
4395         'ME': 'Montenegro',
4396         'MS': 'Montserrat',
4397         'MA': 'Morocco',
4398         'MZ': 'Mozambique',
4399         'MM': 'Myanmar',
4400         'NA': 'Namibia',
4401         'NR': 'Nauru',
4402         'NP': 'Nepal',
4403         'NL': 'Netherlands',
4404         'NC': 'New Caledonia',
4405         'NZ': 'New Zealand',
4406         'NI': 'Nicaragua',
4407         'NE': 'Niger',
4408         'NG': 'Nigeria',
4409         'NU': 'Niue',
4410         'NF': 'Norfolk Island',
4411         'MP': 'Northern Mariana Islands',
4412         'NO': 'Norway',
4413         'OM': 'Oman',
4414         'PK': 'Pakistan',
4415         'PW': 'Palau',
4416         'PS': 'Palestine, State of',
4417         'PA': 'Panama',
4418         'PG': 'Papua New Guinea',
4419         'PY': 'Paraguay',
4420         'PE': 'Peru',
4421         'PH': 'Philippines',
4422         'PN': 'Pitcairn',
4423         'PL': 'Poland',
4424         'PT': 'Portugal',
4425         'PR': 'Puerto Rico',
4426         'QA': 'Qatar',
4427         'RE': 'Réunion',
4428         'RO': 'Romania',
4429         'RU': 'Russian Federation',
4430         'RW': 'Rwanda',
4431         'BL': 'Saint Barthélemy',
4432         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4433         'KN': 'Saint Kitts and Nevis',
4434         'LC': 'Saint Lucia',
4435         'MF': 'Saint Martin (French part)',
4436         'PM': 'Saint Pierre and Miquelon',
4437         'VC': 'Saint Vincent and the Grenadines',
4438         'WS': 'Samoa',
4439         'SM': 'San Marino',
4440         'ST': 'Sao Tome and Principe',
4441         'SA': 'Saudi Arabia',
4442         'SN': 'Senegal',
4443         'RS': 'Serbia',
4444         'SC': 'Seychelles',
4445         'SL': 'Sierra Leone',
4446         'SG': 'Singapore',
4447         'SX': 'Sint Maarten (Dutch part)',
4448         'SK': 'Slovakia',
4449         'SI': 'Slovenia',
4450         'SB': 'Solomon Islands',
4451         'SO': 'Somalia',
4452         'ZA': 'South Africa',
4453         'GS': 'South Georgia and the South Sandwich Islands',
4454         'SS': 'South Sudan',
4455         'ES': 'Spain',
4456         'LK': 'Sri Lanka',
4457         'SD': 'Sudan',
4458         'SR': 'Suriname',
4459         'SJ': 'Svalbard and Jan Mayen',
4460         'SZ': 'Swaziland',
4461         'SE': 'Sweden',
4462         'CH': 'Switzerland',
4463         'SY': 'Syrian Arab Republic',
4464         'TW': 'Taiwan, Province of China',
4465         'TJ': 'Tajikistan',
4466         'TZ': 'Tanzania, United Republic of',
4467         'TH': 'Thailand',
4468         'TL': 'Timor-Leste',
4469         'TG': 'Togo',
4470         'TK': 'Tokelau',
4471         'TO': 'Tonga',
4472         'TT': 'Trinidad and Tobago',
4473         'TN': 'Tunisia',
4474         'TR': 'Turkey',
4475         'TM': 'Turkmenistan',
4476         'TC': 'Turks and Caicos Islands',
4477         'TV': 'Tuvalu',
4478         'UG': 'Uganda',
4479         'UA': 'Ukraine',
4480         'AE': 'United Arab Emirates',
4481         'GB': 'United Kingdom',
4482         'US': 'United States',
4483         'UM': 'United States Minor Outlying Islands',
4484         'UY': 'Uruguay',
4485         'UZ': 'Uzbekistan',
4486         'VU': 'Vanuatu',
4487         'VE': 'Venezuela, Bolivarian Republic of',
4488         'VN': 'Viet Nam',
4489         'VG': 'Virgin Islands, British',
4490         'VI': 'Virgin Islands, U.S.',
4491         'WF': 'Wallis and Futuna',
4492         'EH': 'Western Sahara',
4493         'YE': 'Yemen',
4494         'ZM': 'Zambia',
4495         'ZW': 'Zimbabwe',
4496         # Not ISO 3166 codes, but used for IP blocks
4497         'AP': 'Asia/Pacific Region',
4498         'EU': 'Europe',
4499     }
4500
4501     @classmethod
4502     def short2full(cls, code):
4503         """Convert an ISO 3166-2 country code to the corresponding full name"""
4504         return cls._country_map.get(code.upper())
4505
4506
4507 class GeoUtils:
4508     # Major IPv4 address blocks per country
4509     _country_ip_map = {
4510         'AD': '46.172.224.0/19',
4511         'AE': '94.200.0.0/13',
4512         'AF': '149.54.0.0/17',
4513         'AG': '209.59.64.0/18',
4514         'AI': '204.14.248.0/21',
4515         'AL': '46.99.0.0/16',
4516         'AM': '46.70.0.0/15',
4517         'AO': '105.168.0.0/13',
4518         'AP': '182.50.184.0/21',
4519         'AQ': '23.154.160.0/24',
4520         'AR': '181.0.0.0/12',
4521         'AS': '202.70.112.0/20',
4522         'AT': '77.116.0.0/14',
4523         'AU': '1.128.0.0/11',
4524         'AW': '181.41.0.0/18',
4525         'AX': '185.217.4.0/22',
4526         'AZ': '5.197.0.0/16',
4527         'BA': '31.176.128.0/17',
4528         'BB': '65.48.128.0/17',
4529         'BD': '114.130.0.0/16',
4530         'BE': '57.0.0.0/8',
4531         'BF': '102.178.0.0/15',
4532         'BG': '95.42.0.0/15',
4533         'BH': '37.131.0.0/17',
4534         'BI': '154.117.192.0/18',
4535         'BJ': '137.255.0.0/16',
4536         'BL': '185.212.72.0/23',
4537         'BM': '196.12.64.0/18',
4538         'BN': '156.31.0.0/16',
4539         'BO': '161.56.0.0/16',
4540         'BQ': '161.0.80.0/20',
4541         'BR': '191.128.0.0/12',
4542         'BS': '24.51.64.0/18',
4543         'BT': '119.2.96.0/19',
4544         'BW': '168.167.0.0/16',
4545         'BY': '178.120.0.0/13',
4546         'BZ': '179.42.192.0/18',
4547         'CA': '99.224.0.0/11',
4548         'CD': '41.243.0.0/16',
4549         'CF': '197.242.176.0/21',
4550         'CG': '160.113.0.0/16',
4551         'CH': '85.0.0.0/13',
4552         'CI': '102.136.0.0/14',
4553         'CK': '202.65.32.0/19',
4554         'CL': '152.172.0.0/14',
4555         'CM': '102.244.0.0/14',
4556         'CN': '36.128.0.0/10',
4557         'CO': '181.240.0.0/12',
4558         'CR': '201.192.0.0/12',
4559         'CU': '152.206.0.0/15',
4560         'CV': '165.90.96.0/19',
4561         'CW': '190.88.128.0/17',
4562         'CY': '31.153.0.0/16',
4563         'CZ': '88.100.0.0/14',
4564         'DE': '53.0.0.0/8',
4565         'DJ': '197.241.0.0/17',
4566         'DK': '87.48.0.0/12',
4567         'DM': '192.243.48.0/20',
4568         'DO': '152.166.0.0/15',
4569         'DZ': '41.96.0.0/12',
4570         'EC': '186.68.0.0/15',
4571         'EE': '90.190.0.0/15',
4572         'EG': '156.160.0.0/11',
4573         'ER': '196.200.96.0/20',
4574         'ES': '88.0.0.0/11',
4575         'ET': '196.188.0.0/14',
4576         'EU': '2.16.0.0/13',
4577         'FI': '91.152.0.0/13',
4578         'FJ': '144.120.0.0/16',
4579         'FK': '80.73.208.0/21',
4580         'FM': '119.252.112.0/20',
4581         'FO': '88.85.32.0/19',
4582         'FR': '90.0.0.0/9',
4583         'GA': '41.158.0.0/15',
4584         'GB': '25.0.0.0/8',
4585         'GD': '74.122.88.0/21',
4586         'GE': '31.146.0.0/16',
4587         'GF': '161.22.64.0/18',
4588         'GG': '62.68.160.0/19',
4589         'GH': '154.160.0.0/12',
4590         'GI': '95.164.0.0/16',
4591         'GL': '88.83.0.0/19',
4592         'GM': '160.182.0.0/15',
4593         'GN': '197.149.192.0/18',
4594         'GP': '104.250.0.0/19',
4595         'GQ': '105.235.224.0/20',
4596         'GR': '94.64.0.0/13',
4597         'GT': '168.234.0.0/16',
4598         'GU': '168.123.0.0/16',
4599         'GW': '197.214.80.0/20',
4600         'GY': '181.41.64.0/18',
4601         'HK': '113.252.0.0/14',
4602         'HN': '181.210.0.0/16',
4603         'HR': '93.136.0.0/13',
4604         'HT': '148.102.128.0/17',
4605         'HU': '84.0.0.0/14',
4606         'ID': '39.192.0.0/10',
4607         'IE': '87.32.0.0/12',
4608         'IL': '79.176.0.0/13',
4609         'IM': '5.62.80.0/20',
4610         'IN': '117.192.0.0/10',
4611         'IO': '203.83.48.0/21',
4612         'IQ': '37.236.0.0/14',
4613         'IR': '2.176.0.0/12',
4614         'IS': '82.221.0.0/16',
4615         'IT': '79.0.0.0/10',
4616         'JE': '87.244.64.0/18',
4617         'JM': '72.27.0.0/17',
4618         'JO': '176.29.0.0/16',
4619         'JP': '133.0.0.0/8',
4620         'KE': '105.48.0.0/12',
4621         'KG': '158.181.128.0/17',
4622         'KH': '36.37.128.0/17',
4623         'KI': '103.25.140.0/22',
4624         'KM': '197.255.224.0/20',
4625         'KN': '198.167.192.0/19',
4626         'KP': '175.45.176.0/22',
4627         'KR': '175.192.0.0/10',
4628         'KW': '37.36.0.0/14',
4629         'KY': '64.96.0.0/15',
4630         'KZ': '2.72.0.0/13',
4631         'LA': '115.84.64.0/18',
4632         'LB': '178.135.0.0/16',
4633         'LC': '24.92.144.0/20',
4634         'LI': '82.117.0.0/19',
4635         'LK': '112.134.0.0/15',
4636         'LR': '102.183.0.0/16',
4637         'LS': '129.232.0.0/17',
4638         'LT': '78.56.0.0/13',
4639         'LU': '188.42.0.0/16',
4640         'LV': '46.109.0.0/16',
4641         'LY': '41.252.0.0/14',
4642         'MA': '105.128.0.0/11',
4643         'MC': '88.209.64.0/18',
4644         'MD': '37.246.0.0/16',
4645         'ME': '178.175.0.0/17',
4646         'MF': '74.112.232.0/21',
4647         'MG': '154.126.0.0/17',
4648         'MH': '117.103.88.0/21',
4649         'MK': '77.28.0.0/15',
4650         'ML': '154.118.128.0/18',
4651         'MM': '37.111.0.0/17',
4652         'MN': '49.0.128.0/17',
4653         'MO': '60.246.0.0/16',
4654         'MP': '202.88.64.0/20',
4655         'MQ': '109.203.224.0/19',
4656         'MR': '41.188.64.0/18',
4657         'MS': '208.90.112.0/22',
4658         'MT': '46.11.0.0/16',
4659         'MU': '105.16.0.0/12',
4660         'MV': '27.114.128.0/18',
4661         'MW': '102.70.0.0/15',
4662         'MX': '187.192.0.0/11',
4663         'MY': '175.136.0.0/13',
4664         'MZ': '197.218.0.0/15',
4665         'NA': '41.182.0.0/16',
4666         'NC': '101.101.0.0/18',
4667         'NE': '197.214.0.0/18',
4668         'NF': '203.17.240.0/22',
4669         'NG': '105.112.0.0/12',
4670         'NI': '186.76.0.0/15',
4671         'NL': '145.96.0.0/11',
4672         'NO': '84.208.0.0/13',
4673         'NP': '36.252.0.0/15',
4674         'NR': '203.98.224.0/19',
4675         'NU': '49.156.48.0/22',
4676         'NZ': '49.224.0.0/14',
4677         'OM': '5.36.0.0/15',
4678         'PA': '186.72.0.0/15',
4679         'PE': '186.160.0.0/14',
4680         'PF': '123.50.64.0/18',
4681         'PG': '124.240.192.0/19',
4682         'PH': '49.144.0.0/13',
4683         'PK': '39.32.0.0/11',
4684         'PL': '83.0.0.0/11',
4685         'PM': '70.36.0.0/20',
4686         'PR': '66.50.0.0/16',
4687         'PS': '188.161.0.0/16',
4688         'PT': '85.240.0.0/13',
4689         'PW': '202.124.224.0/20',
4690         'PY': '181.120.0.0/14',
4691         'QA': '37.210.0.0/15',
4692         'RE': '102.35.0.0/16',
4693         'RO': '79.112.0.0/13',
4694         'RS': '93.86.0.0/15',
4695         'RU': '5.136.0.0/13',
4696         'RW': '41.186.0.0/16',
4697         'SA': '188.48.0.0/13',
4698         'SB': '202.1.160.0/19',
4699         'SC': '154.192.0.0/11',
4700         'SD': '102.120.0.0/13',
4701         'SE': '78.64.0.0/12',
4702         'SG': '8.128.0.0/10',
4703         'SI': '188.196.0.0/14',
4704         'SK': '78.98.0.0/15',
4705         'SL': '102.143.0.0/17',
4706         'SM': '89.186.32.0/19',
4707         'SN': '41.82.0.0/15',
4708         'SO': '154.115.192.0/18',
4709         'SR': '186.179.128.0/17',
4710         'SS': '105.235.208.0/21',
4711         'ST': '197.159.160.0/19',
4712         'SV': '168.243.0.0/16',
4713         'SX': '190.102.0.0/20',
4714         'SY': '5.0.0.0/16',
4715         'SZ': '41.84.224.0/19',
4716         'TC': '65.255.48.0/20',
4717         'TD': '154.68.128.0/19',
4718         'TG': '196.168.0.0/14',
4719         'TH': '171.96.0.0/13',
4720         'TJ': '85.9.128.0/18',
4721         'TK': '27.96.24.0/21',
4722         'TL': '180.189.160.0/20',
4723         'TM': '95.85.96.0/19',
4724         'TN': '197.0.0.0/11',
4725         'TO': '175.176.144.0/21',
4726         'TR': '78.160.0.0/11',
4727         'TT': '186.44.0.0/15',
4728         'TV': '202.2.96.0/19',
4729         'TW': '120.96.0.0/11',
4730         'TZ': '156.156.0.0/14',
4731         'UA': '37.52.0.0/14',
4732         'UG': '102.80.0.0/13',
4733         'US': '6.0.0.0/8',
4734         'UY': '167.56.0.0/13',
4735         'UZ': '84.54.64.0/18',
4736         'VA': '212.77.0.0/19',
4737         'VC': '207.191.240.0/21',
4738         'VE': '186.88.0.0/13',
4739         'VG': '66.81.192.0/20',
4740         'VI': '146.226.0.0/16',
4741         'VN': '14.160.0.0/11',
4742         'VU': '202.80.32.0/20',
4743         'WF': '117.20.32.0/21',
4744         'WS': '202.4.32.0/19',
4745         'YE': '134.35.0.0/16',
4746         'YT': '41.242.116.0/22',
4747         'ZA': '41.0.0.0/11',
4748         'ZM': '102.144.0.0/13',
4749         'ZW': '102.177.192.0/18',
4750     }
4751
4752     @classmethod
4753     def random_ipv4(cls, code_or_block):
4754         if len(code_or_block) == 2:
4755             block = cls._country_ip_map.get(code_or_block.upper())
4756             if not block:
4757                 return None
4758         else:
4759             block = code_or_block
4760         addr, preflen = block.split('/')
4761         addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4762         addr_max = addr_min | (0xffffffff >> int(preflen))
4763         return str(socket.inet_ntoa(
4764             struct.pack('!L', random.randint(addr_min, addr_max))))
4765
4766
4767 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4768     def __init__(self, proxies=None):
4769         # Set default handlers
4770         for type in ('http', 'https'):
4771             setattr(self, '%s_open' % type,
4772                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4773                         meth(r, proxy, type))
4774         urllib.request.ProxyHandler.__init__(self, proxies)
4775
4776     def proxy_open(self, req, proxy, type):
4777         req_proxy = req.headers.get('Ytdl-request-proxy')
4778         if req_proxy is not None:
4779             proxy = req_proxy
4780             del req.headers['Ytdl-request-proxy']
4781
4782         if proxy == '__noproxy__':
4783             return None  # No Proxy
4784         if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4785             req.add_header('Ytdl-socks-proxy', proxy)
4786             # yt-dlp's http/https handlers do wrapping the socket with socks
4787             return None
4788         return urllib.request.ProxyHandler.proxy_open(
4789             self, req, proxy, type)
4790
4791
4792 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4793 # released into Public Domain
4794 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4795
4796 def long_to_bytes(n, blocksize=0):
4797     """long_to_bytes(n:long, blocksize:int) : string
4798     Convert a long integer to a byte string.
4799
4800     If optional blocksize is given and greater than zero, pad the front of the
4801     byte string with binary zeros so that the length is a multiple of
4802     blocksize.
4803     """
4804     # after much testing, this algorithm was deemed to be the fastest
4805     s = b''
4806     n = int(n)
4807     while n > 0:
4808         s = struct.pack('>I', n & 0xffffffff) + s
4809         n = n >> 32
4810     # strip off leading zeros
4811     for i in range(len(s)):
4812         if s[i] != b'\000'[0]:
4813             break
4814     else:
4815         # only happens when n == 0
4816         s = b'\000'
4817         i = 0
4818     s = s[i:]
4819     # add back some pad bytes.  this could be done more efficiently w.r.t. the
4820     # de-padding being done above, but sigh...
4821     if blocksize > 0 and len(s) % blocksize:
4822         s = (blocksize - len(s) % blocksize) * b'\000' + s
4823     return s
4824
4825
4826 def bytes_to_long(s):
4827     """bytes_to_long(string) : long
4828     Convert a byte string to a long integer.
4829
4830     This is (essentially) the inverse of long_to_bytes().
4831     """
4832     acc = 0
4833     length = len(s)
4834     if length % 4:
4835         extra = (4 - length % 4)
4836         s = b'\000' * extra + s
4837         length = length + extra
4838     for i in range(0, length, 4):
4839         acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4840     return acc
4841
4842
4843 def ohdave_rsa_encrypt(data, exponent, modulus):
4844     '''
4845     Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4846
4847     Input:
4848         data: data to encrypt, bytes-like object
4849         exponent, modulus: parameter e and N of RSA algorithm, both integer
4850     Output: hex string of encrypted data
4851
4852     Limitation: supports one block encryption only
4853     '''
4854
4855     payload = int(binascii.hexlify(data[::-1]), 16)
4856     encrypted = pow(payload, exponent, modulus)
4857     return '%x' % encrypted
4858
4859
4860 def pkcs1pad(data, length):
4861     """
4862     Padding input data with PKCS#1 scheme
4863
4864     @param {int[]} data        input data
4865     @param {int}   length      target length
4866     @returns {int[]}           padded data
4867     """
4868     if len(data) > length - 11:
4869         raise ValueError('Input data too long for PKCS#1 padding')
4870
4871     pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4872     return [0, 2] + pseudo_random + [0] + data
4873
4874
4875 def _base_n_table(n, table):
4876     if not table and not n:
4877         raise ValueError('Either table or n must be specified')
4878     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4879
4880     if n and n != len(table):
4881         raise ValueError(f'base {n} exceeds table length {len(table)}')
4882     return table
4883
4884
4885 def encode_base_n(num, n=None, table=None):
4886     """Convert given int to a base-n string"""
4887     table = _base_n_table(n, table)
4888     if not num:
4889         return table[0]
4890
4891     result, base = '', len(table)
4892     while num:
4893         result = table[num % base] + result
4894         num = num // base
4895     return result
4896
4897
4898 def decode_base_n(string, n=None, table=None):
4899     """Convert given base-n string to int"""
4900     table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4901     result, base = 0, len(table)
4902     for char in string:
4903         result = result * base + table[char]
4904     return result
4905
4906
4907 def decode_packed_codes(code):
4908     mobj = re.search(PACKED_CODES_RE, code)
4909     obfuscated_code, base, count, symbols = mobj.groups()
4910     base = int(base)
4911     count = int(count)
4912     symbols = symbols.split('|')
4913     symbol_table = {}
4914
4915     while count:
4916         count -= 1
4917         base_n_count = encode_base_n(count, base)
4918         symbol_table[base_n_count] = symbols[count] or base_n_count
4919
4920     return re.sub(
4921         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4922         obfuscated_code)
4923
4924
4925 def caesar(s, alphabet, shift):
4926     if shift == 0:
4927         return s
4928     l = len(alphabet)
4929     return ''.join(
4930         alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4931         for c in s)
4932
4933
4934 def rot47(s):
4935     return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4936
4937
4938 def parse_m3u8_attributes(attrib):
4939     info = {}
4940     for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4941         if val.startswith('"'):
4942             val = val[1:-1]
4943         info[key] = val
4944     return info
4945
4946
4947 def urshift(val, n):
4948     return val >> n if val >= 0 else (val + 0x100000000) >> n
4949
4950
4951 def write_xattr(path, key, value):
4952     # Windows: Write xattrs to NTFS Alternate Data Streams:
4953     # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4954     if compat_os_name == 'nt':
4955         assert ':' not in key
4956         assert os.path.exists(path)
4957
4958         try:
4959             with open(f'{path}:{key}', 'wb') as f:
4960                 f.write(value)
4961         except OSError as e:
4962             raise XAttrMetadataError(e.errno, e.strerror)
4963         return
4964
4965     # UNIX Method 1. Use xattrs/pyxattrs modules
4966
4967     setxattr = None
4968     if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4969         # Unicode arguments are not supported in pyxattr until version 0.5.0
4970         # See https://github.com/ytdl-org/youtube-dl/issues/5498
4971         if version_tuple(xattr.__version__) >= (0, 5, 0):
4972             setxattr = xattr.set
4973     elif xattr:
4974         setxattr = xattr.setxattr
4975
4976     if setxattr:
4977         try:
4978             setxattr(path, key, value)
4979         except OSError as e:
4980             raise XAttrMetadataError(e.errno, e.strerror)
4981         return
4982
4983     # UNIX Method 2. Use setfattr/xattr executables
4984     exe = ('setfattr' if check_executable('setfattr', ['--version'])
4985            else 'xattr' if check_executable('xattr', ['-h']) else None)
4986     if not exe:
4987         raise XAttrUnavailableError(
4988             'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4989             + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4990
4991     value = value.decode()
4992     try:
4993         _, stderr, returncode = Popen.run(
4994             [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4995             text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4996     except OSError as e:
4997         raise XAttrMetadataError(e.errno, e.strerror)
4998     if returncode:
4999         raise XAttrMetadataError(returncode, stderr)
5000
5001
5002 def random_birthday(year_field, month_field, day_field):
5003     start_date = datetime.date(1950, 1, 1)
5004     end_date = datetime.date(1995, 12, 31)
5005     offset = random.randint(0, (end_date - start_date).days)
5006     random_date = start_date + datetime.timedelta(offset)
5007     return {
5008         year_field: str(random_date.year),
5009         month_field: str(random_date.month),
5010         day_field: str(random_date.day),
5011     }
5012
5013
5014 def find_available_port(interface=''):
5015     try:
5016         with socket.socket() as sock:
5017             sock.bind((interface, 0))
5018             return sock.getsockname()[1]
5019     except OSError:
5020         return None
5021
5022
5023 # Templates for internet shortcut files, which are plain text files.
5024 DOT_URL_LINK_TEMPLATE = '''\
5025 [InternetShortcut]
5026 URL=%(url)s
5027 '''
5028
5029 DOT_WEBLOC_LINK_TEMPLATE = '''\
5030 <?xml version="1.0" encoding="UTF-8"?>
5031 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5032 <plist version="1.0">
5033 <dict>
5034 \t<key>URL</key>
5035 \t<string>%(url)s</string>
5036 </dict>
5037 </plist>
5038 '''
5039
5040 DOT_DESKTOP_LINK_TEMPLATE = '''\
5041 [Desktop Entry]
5042 Encoding=UTF-8
5043 Name=%(filename)s
5044 Type=Link
5045 URL=%(url)s
5046 Icon=text-html
5047 '''
5048
5049 LINK_TEMPLATES = {
5050     'url': DOT_URL_LINK_TEMPLATE,
5051     'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5052     'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5053 }
5054
5055
5056 def iri_to_uri(iri):
5057     """
5058     Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5059
5060     The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5061     """
5062
5063     iri_parts = urllib.parse.urlparse(iri)
5064
5065     if '[' in iri_parts.netloc:
5066         raise ValueError('IPv6 URIs are not, yet, supported.')
5067         # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5068
5069     # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5070
5071     net_location = ''
5072     if iri_parts.username:
5073         net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5074         if iri_parts.password is not None:
5075             net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5076         net_location += '@'
5077
5078     net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
5079     # The 'idna' encoding produces ASCII text.
5080     if iri_parts.port is not None and iri_parts.port != 80:
5081         net_location += ':' + str(iri_parts.port)
5082
5083     return urllib.parse.urlunparse(
5084         (iri_parts.scheme,
5085             net_location,
5086
5087             urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5088
5089             # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5090             urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5091
5092             # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5093             urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5094
5095             urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5096
5097     # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5098
5099
5100 def to_high_limit_path(path):
5101     if sys.platform in ['win32', 'cygwin']:
5102         # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5103         return '\\\\?\\' + os.path.abspath(path)
5104
5105     return path
5106
5107
5108 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5109     val = traversal.traverse_obj(obj, *variadic(field))
5110     if not val if ignore is NO_DEFAULT else val in variadic(ignore):
5111         return default
5112     return template % func(val)
5113
5114
5115 def clean_podcast_url(url):
5116     return re.sub(r'''(?x)
5117         (?:
5118             (?:
5119                 chtbl\.com/track|
5120                 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5121                 play\.podtrac\.com
5122             )/[^/]+|
5123             (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5124             flex\.acast\.com|
5125             pd(?:
5126                 cn\.co| # https://podcorn.com/analytics-prefix/
5127                 st\.fm # https://podsights.com/docs/
5128             )/e
5129         )/''', '', url)
5130
5131
5132 _HEX_TABLE = '0123456789abcdef'
5133
5134
5135 def random_uuidv4():
5136     return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5137
5138
5139 def make_dir(path, to_screen=None):
5140     try:
5141         dn = os.path.dirname(path)
5142         if dn:
5143             os.makedirs(dn, exist_ok=True)
5144         return True
5145     except OSError as err:
5146         if callable(to_screen) is not None:
5147             to_screen(f'unable to create directory {err}')
5148         return False
5149
5150
5151 def get_executable_path():
5152     from ..update import _get_variant_and_executable_path
5153
5154     return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5155
5156
5157 def get_user_config_dirs(package_name):
5158     # .config (e.g. ~/.config/package_name)
5159     xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
5160     yield os.path.join(xdg_config_home, package_name)
5161
5162     # appdata (%APPDATA%/package_name)
5163     appdata_dir = os.getenv('appdata')
5164     if appdata_dir:
5165         yield os.path.join(appdata_dir, package_name)
5166
5167     # home (~/.package_name)
5168     yield os.path.join(compat_expanduser('~'), f'.{package_name}')
5169
5170
5171 def get_system_config_dirs(package_name):
5172     # /etc/package_name
5173     yield os.path.join('/etc', package_name)
5174
5175
5176 def time_seconds(**kwargs):
5177     """
5178     Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
5179     """
5180     return time.time() + datetime.timedelta(**kwargs).total_seconds()
5181
5182
5183 # create a JSON Web Signature (jws) with HS256 algorithm
5184 # the resulting format is in JWS Compact Serialization
5185 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5186 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5187 def jwt_encode_hs256(payload_data, key, headers={}):
5188     header_data = {
5189         'alg': 'HS256',
5190         'typ': 'JWT',
5191     }
5192     if headers:
5193         header_data.update(headers)
5194     header_b64 = base64.b64encode(json.dumps(header_data).encode())
5195     payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5196     h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5197     signature_b64 = base64.b64encode(h.digest())
5198     token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5199     return token
5200
5201
5202 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5203 def jwt_decode_hs256(jwt):
5204     header_b64, payload_b64, signature_b64 = jwt.split('.')
5205     # add trailing ='s that may have been stripped, superfluous ='s are ignored
5206     payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
5207     return payload_data
5208
5209
5210 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5211
5212
5213 @functools.cache
5214 def supports_terminal_sequences(stream):
5215     if compat_os_name == 'nt':
5216         if not WINDOWS_VT_MODE:
5217             return False
5218     elif not os.getenv('TERM'):
5219         return False
5220     try:
5221         return stream.isatty()
5222     except BaseException:
5223         return False
5224
5225
5226 def windows_enable_vt_mode():
5227     """Ref: https://bugs.python.org/issue30075 """
5228     if get_windows_version() < (10, 0, 10586):
5229         return
5230
5231     import ctypes
5232     import ctypes.wintypes
5233     import msvcrt
5234
5235     ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5236
5237     dll = ctypes.WinDLL('kernel32', use_last_error=False)
5238     handle = os.open('CONOUT$', os.O_RDWR)
5239     try:
5240         h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5241         dw_original_mode = ctypes.wintypes.DWORD()
5242         success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5243         if not success:
5244             raise Exception('GetConsoleMode failed')
5245
5246         success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5247             dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5248         if not success:
5249             raise Exception('SetConsoleMode failed')
5250     finally:
5251         os.close(handle)
5252
5253     global WINDOWS_VT_MODE
5254     WINDOWS_VT_MODE = True
5255     supports_terminal_sequences.cache_clear()
5256
5257
5258 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5259
5260
5261 def remove_terminal_sequences(string):
5262     return _terminal_sequences_re.sub('', string)
5263
5264
5265 def number_of_digits(number):
5266     return len('%d' % number)
5267
5268
5269 def join_nonempty(*values, delim='-', from_dict=None):
5270     if from_dict is not None:
5271         values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
5272     return delim.join(map(str, filter(None, values)))
5273
5274
5275 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5276     """
5277     Find the largest format dimensions in terms of video width and, for each thumbnail:
5278     * Modify the URL: Match the width with the provided regex and replace with the former width
5279     * Update dimensions
5280
5281     This function is useful with video services that scale the provided thumbnails on demand
5282     """
5283     _keys = ('width', 'height')
5284     max_dimensions = max(
5285         (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5286         default=(0, 0))
5287     if not max_dimensions[0]:
5288         return thumbnails
5289     return [
5290         merge_dicts(
5291             {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5292             dict(zip(_keys, max_dimensions)), thumbnail)
5293         for thumbnail in thumbnails
5294     ]
5295
5296
5297 def parse_http_range(range):
5298     """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5299     if not range:
5300         return None, None, None
5301     crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5302     if not crg:
5303         return None, None, None
5304     return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5305
5306
5307 def read_stdin(what):
5308     eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5309     write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5310     return sys.stdin
5311
5312
5313 def determine_file_encoding(data):
5314     """
5315     Detect the text encoding used
5316     @returns (encoding, bytes to skip)
5317     """
5318
5319     # BOM marks are given priority over declarations
5320     for bom, enc in BOMS:
5321         if data.startswith(bom):
5322             return enc, len(bom)
5323
5324     # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5325     # We ignore the endianness to get a good enough match
5326     data = data.replace(b'\0', b'')
5327     mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5328     return mobj.group(1).decode() if mobj else None, 0
5329
5330
5331 class Config:
5332     own_args = None
5333     parsed_args = None
5334     filename = None
5335     __initialized = False
5336
5337     def __init__(self, parser, label=None):
5338         self.parser, self.label = parser, label
5339         self._loaded_paths, self.configs = set(), []
5340
5341     def init(self, args=None, filename=None):
5342         assert not self.__initialized
5343         self.own_args, self.filename = args, filename
5344         return self.load_configs()
5345
5346     def load_configs(self):
5347         directory = ''
5348         if self.filename:
5349             location = os.path.realpath(self.filename)
5350             directory = os.path.dirname(location)
5351             if location in self._loaded_paths:
5352                 return False
5353             self._loaded_paths.add(location)
5354
5355         self.__initialized = True
5356         opts, _ = self.parser.parse_known_args(self.own_args)
5357         self.parsed_args = self.own_args
5358         for location in opts.config_locations or []:
5359             if location == '-':
5360                 if location in self._loaded_paths:
5361                     continue
5362                 self._loaded_paths.add(location)
5363                 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5364                 continue
5365             location = os.path.join(directory, expand_path(location))
5366             if os.path.isdir(location):
5367                 location = os.path.join(location, 'yt-dlp.conf')
5368             if not os.path.exists(location):
5369                 self.parser.error(f'config location {location} does not exist')
5370             self.append_config(self.read_file(location), location)
5371         return True
5372
5373     def __str__(self):
5374         label = join_nonempty(
5375             self.label, 'config', f'"{self.filename}"' if self.filename else '',
5376             delim=' ')
5377         return join_nonempty(
5378             self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5379             *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5380             delim='\n')
5381
5382     @staticmethod
5383     def read_file(filename, default=[]):
5384         try:
5385             optionf = open(filename, 'rb')
5386         except OSError:
5387             return default  # silently skip if file is not present
5388         try:
5389             enc, skip = determine_file_encoding(optionf.read(512))
5390             optionf.seek(skip, io.SEEK_SET)
5391         except OSError:
5392             enc = None  # silently skip read errors
5393         try:
5394             # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5395             contents = optionf.read().decode(enc or preferredencoding())
5396             res = shlex.split(contents, comments=True)
5397         except Exception as err:
5398             raise ValueError(f'Unable to parse "{filename}": {err}')
5399         finally:
5400             optionf.close()
5401         return res
5402
5403     @staticmethod
5404     def hide_login_info(opts):
5405         PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5406         eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5407
5408         def _scrub_eq(o):
5409             m = eqre.match(o)
5410             if m:
5411                 return m.group('key') + '=PRIVATE'
5412             else:
5413                 return o
5414
5415         opts = list(map(_scrub_eq, opts))
5416         for idx, opt in enumerate(opts):
5417             if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5418                 opts[idx + 1] = 'PRIVATE'
5419         return opts
5420
5421     def append_config(self, *args, label=None):
5422         config = type(self)(self.parser, label)
5423         config._loaded_paths = self._loaded_paths
5424         if config.init(*args):
5425             self.configs.append(config)
5426
5427     @property
5428     def all_args(self):
5429         for config in reversed(self.configs):
5430             yield from config.all_args
5431         yield from self.parsed_args or []
5432
5433     def parse_known_args(self, **kwargs):
5434         return self.parser.parse_known_args(self.all_args, **kwargs)
5435
5436     def parse_args(self):
5437         return self.parser.parse_args(self.all_args)
5438
5439
5440 class WebSocketsWrapper:
5441     """Wraps websockets module to use in non-async scopes"""
5442     pool = None
5443
5444     def __init__(self, url, headers=None, connect=True):
5445         self.loop = asyncio.new_event_loop()
5446         # XXX: "loop" is deprecated
5447         self.conn = websockets.connect(
5448             url, extra_headers=headers, ping_interval=None,
5449             close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5450         if connect:
5451             self.__enter__()
5452         atexit.register(self.__exit__, None, None, None)
5453
5454     def __enter__(self):
5455         if not self.pool:
5456             self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5457         return self
5458
5459     def send(self, *args):
5460         self.run_with_loop(self.pool.send(*args), self.loop)
5461
5462     def recv(self, *args):
5463         return self.run_with_loop(self.pool.recv(*args), self.loop)
5464
5465     def __exit__(self, type, value, traceback):
5466         try:
5467             return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5468         finally:
5469             self.loop.close()
5470             self._cancel_all_tasks(self.loop)
5471
5472     # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5473     # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
5474     @staticmethod
5475     def run_with_loop(main, loop):
5476         if not asyncio.iscoroutine(main):
5477             raise ValueError(f'a coroutine was expected, got {main!r}')
5478
5479         try:
5480             return loop.run_until_complete(main)
5481         finally:
5482             loop.run_until_complete(loop.shutdown_asyncgens())
5483             if hasattr(loop, 'shutdown_default_executor'):
5484                 loop.run_until_complete(loop.shutdown_default_executor())
5485
5486     @staticmethod
5487     def _cancel_all_tasks(loop):
5488         to_cancel = asyncio.all_tasks(loop)
5489
5490         if not to_cancel:
5491             return
5492
5493         for task in to_cancel:
5494             task.cancel()
5495
5496         # XXX: "loop" is removed in python 3.10+
5497         loop.run_until_complete(
5498             asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5499
5500         for task in to_cancel:
5501             if task.cancelled():
5502                 continue
5503             if task.exception() is not None:
5504                 loop.call_exception_handler({
5505                     'message': 'unhandled exception during asyncio.run() shutdown',
5506                     'exception': task.exception(),
5507                     'task': task,
5508                 })
5509
5510
5511 def merge_headers(*dicts):
5512     """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5513     return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5514
5515
5516 def cached_method(f):
5517     """Cache a method"""
5518     signature = inspect.signature(f)
5519
5520     @functools.wraps(f)
5521     def wrapper(self, *args, **kwargs):
5522         bound_args = signature.bind(self, *args, **kwargs)
5523         bound_args.apply_defaults()
5524         key = tuple(bound_args.arguments.values())[1:]
5525
5526         cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
5527         if key not in cache:
5528             cache[key] = f(self, *args, **kwargs)
5529         return cache[key]
5530     return wrapper
5531
5532
5533 class classproperty:
5534     """property access for class methods with optional caching"""
5535     def __new__(cls, func=None, *args, **kwargs):
5536         if not func:
5537             return functools.partial(cls, *args, **kwargs)
5538         return super().__new__(cls)
5539
5540     def __init__(self, func, *, cache=False):
5541         functools.update_wrapper(self, func)
5542         self.func = func
5543         self._cache = {} if cache else None
5544
5545     def __get__(self, _, cls):
5546         if self._cache is None:
5547             return self.func(cls)
5548         elif cls not in self._cache:
5549             self._cache[cls] = self.func(cls)
5550         return self._cache[cls]
5551
5552
5553 class function_with_repr:
5554     def __init__(self, func, repr_=None):
5555         functools.update_wrapper(self, func)
5556         self.func, self.__repr = func, repr_
5557
5558     def __call__(self, *args, **kwargs):
5559         return self.func(*args, **kwargs)
5560
5561     def __repr__(self):
5562         if self.__repr:
5563             return self.__repr
5564         return f'{self.func.__module__}.{self.func.__qualname__}'
5565
5566
5567 class Namespace(types.SimpleNamespace):
5568     """Immutable namespace"""
5569
5570     def __iter__(self):
5571         return iter(self.__dict__.values())
5572
5573     @property
5574     def items_(self):
5575         return self.__dict__.items()
5576
5577
5578 MEDIA_EXTENSIONS = Namespace(
5579     common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5580     video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5581     common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5582     audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
5583     thumbnails=('jpg', 'png', 'webp'),
5584     storyboards=('mhtml', ),
5585     subtitles=('srt', 'vtt', 'ass', 'lrc'),
5586     manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5587 )
5588 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5589 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5590
5591 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5592
5593
5594 class RetryManager:
5595     """Usage:
5596         for retry in RetryManager(...):
5597             try:
5598                 ...
5599             except SomeException as err:
5600                 retry.error = err
5601                 continue
5602     """
5603     attempt, _error = 0, None
5604
5605     def __init__(self, _retries, _error_callback, **kwargs):
5606         self.retries = _retries or 0
5607         self.error_callback = functools.partial(_error_callback, **kwargs)
5608
5609     def _should_retry(self):
5610         return self._error is not NO_DEFAULT and self.attempt <= self.retries
5611
5612     @property
5613     def error(self):
5614         if self._error is NO_DEFAULT:
5615             return None
5616         return self._error
5617
5618     @error.setter
5619     def error(self, value):
5620         self._error = value
5621
5622     def __iter__(self):
5623         while self._should_retry():
5624             self.error = NO_DEFAULT
5625             self.attempt += 1
5626             yield self
5627             if self.error:
5628                 self.error_callback(self.error, self.attempt, self.retries)
5629
5630     @staticmethod
5631     def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5632         """Utility function for reporting retries"""
5633         if count > retries:
5634             if error:
5635                 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5636             raise e
5637
5638         if not count:
5639             return warn(e)
5640         elif isinstance(e, ExtractorError):
5641             e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5642         warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5643
5644         delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5645         if delay:
5646             info(f'Sleeping {delay:.2f} seconds ...')
5647             time.sleep(delay)
5648
5649
5650 def make_archive_id(ie, video_id):
5651     ie_key = ie if isinstance(ie, str) else ie.ie_key()
5652     return f'{ie_key.lower()} {video_id}'
5653
5654
5655 def truncate_string(s, left, right=0):
5656     assert left > 3 and right >= 0
5657     if s is None or len(s) <= left + right:
5658         return s
5659     return f'{s[:left-3]}...{s[-right:] if right else ""}'
5660
5661
5662 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5663     assert 'all' in alias_dict, '"all" alias is required'
5664     requested = list(start or [])
5665     for val in options:
5666         discard = val.startswith('-')
5667         if discard:
5668             val = val[1:]
5669
5670         if val in alias_dict:
5671             val = alias_dict[val] if not discard else [
5672                 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5673             # NB: Do not allow regex in aliases for performance
5674             requested = orderedSet_from_options(val, alias_dict, start=requested)
5675             continue
5676
5677         current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5678                    else [val] if val in alias_dict['all'] else None)
5679         if current is None:
5680             raise ValueError(val)
5681
5682         if discard:
5683             for item in current:
5684                 while item in requested:
5685                     requested.remove(item)
5686         else:
5687             requested.extend(current)
5688
5689     return orderedSet(requested)
5690
5691
5692 # TODO: Rewrite
5693 class FormatSorter:
5694     regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5695
5696     default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5697                'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5698                'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
5699     ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5700                     'height', 'width', 'proto', 'vext', 'abr', 'aext',
5701                     'fps', 'fs_approx', 'source', 'id')
5702
5703     settings = {
5704         'vcodec': {'type': 'ordered', 'regex': True,
5705                    'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5706         'acodec': {'type': 'ordered', 'regex': True,
5707                    'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
5708         'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5709                 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5710         'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5711                   'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5712         'vext': {'type': 'ordered', 'field': 'video_ext',
5713                  'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5714                  'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
5715         'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5716                  'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5717                  'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
5718         'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5719         'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5720                        'field': ('vcodec', 'acodec'),
5721                        'function': lambda it: int(any(v != 'none' for v in it))},
5722         'ie_pref': {'priority': True, 'type': 'extractor'},
5723         'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5724         'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5725         'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5726         'quality': {'convert': 'float', 'default': -1},
5727         'filesize': {'convert': 'bytes'},
5728         'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5729         'id': {'convert': 'string', 'field': 'format_id'},
5730         'height': {'convert': 'float_none'},
5731         'width': {'convert': 'float_none'},
5732         'fps': {'convert': 'float_none'},
5733         'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5734         'tbr': {'convert': 'float_none'},
5735         'vbr': {'convert': 'float_none'},
5736         'abr': {'convert': 'float_none'},
5737         'asr': {'convert': 'float_none'},
5738         'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5739
5740         'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5741         'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
5742                'function': lambda it: next(filter(None, it), None)},
5743         'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
5744                  'function': lambda it: next(filter(None, it), None)},
5745         'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5746         'res': {'type': 'multiple', 'field': ('height', 'width'),
5747                 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5748
5749         # Actual field names
5750         'format_id': {'type': 'alias', 'field': 'id'},
5751         'preference': {'type': 'alias', 'field': 'ie_pref'},
5752         'language_preference': {'type': 'alias', 'field': 'lang'},
5753         'source_preference': {'type': 'alias', 'field': 'source'},
5754         'protocol': {'type': 'alias', 'field': 'proto'},
5755         'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5756         'audio_channels': {'type': 'alias', 'field': 'channels'},
5757
5758         # Deprecated
5759         'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5760         'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5761         'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5762         'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5763         'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5764         'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5765         'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5766         'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5767         'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5768         'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5769         'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5770         'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5771         'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5772         'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5773         'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5774         'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5775         'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5776         'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5777         'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5778         'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5779     }
5780
5781     def __init__(self, ydl, field_preference):
5782         self.ydl = ydl
5783         self._order = []
5784         self.evaluate_params(self.ydl.params, field_preference)
5785         if ydl.params.get('verbose'):
5786             self.print_verbose_info(self.ydl.write_debug)
5787
5788     def _get_field_setting(self, field, key):
5789         if field not in self.settings:
5790             if key in ('forced', 'priority'):
5791                 return False
5792             self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5793                                         'deprecated and may be removed in a future version')
5794             self.settings[field] = {}
5795         propObj = self.settings[field]
5796         if key not in propObj:
5797             type = propObj.get('type')
5798             if key == 'field':
5799                 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5800             elif key == 'convert':
5801                 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5802             else:
5803                 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5804             propObj[key] = default
5805         return propObj[key]
5806
5807     def _resolve_field_value(self, field, value, convertNone=False):
5808         if value is None:
5809             if not convertNone:
5810                 return None
5811         else:
5812             value = value.lower()
5813         conversion = self._get_field_setting(field, 'convert')
5814         if conversion == 'ignore':
5815             return None
5816         if conversion == 'string':
5817             return value
5818         elif conversion == 'float_none':
5819             return float_or_none(value)
5820         elif conversion == 'bytes':
5821             return parse_bytes(value)
5822         elif conversion == 'order':
5823             order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5824             use_regex = self._get_field_setting(field, 'regex')
5825             list_length = len(order_list)
5826             empty_pos = order_list.index('') if '' in order_list else list_length + 1
5827             if use_regex and value is not None:
5828                 for i, regex in enumerate(order_list):
5829                     if regex and re.match(regex, value):
5830                         return list_length - i
5831                 return list_length - empty_pos  # not in list
5832             else:  # not regex or  value = None
5833                 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5834         else:
5835             if value.isnumeric():
5836                 return float(value)
5837             else:
5838                 self.settings[field]['convert'] = 'string'
5839                 return value
5840
5841     def evaluate_params(self, params, sort_extractor):
5842         self._use_free_order = params.get('prefer_free_formats', False)
5843         self._sort_user = params.get('format_sort', [])
5844         self._sort_extractor = sort_extractor
5845
5846         def add_item(field, reverse, closest, limit_text):
5847             field = field.lower()
5848             if field in self._order:
5849                 return
5850             self._order.append(field)
5851             limit = self._resolve_field_value(field, limit_text)
5852             data = {
5853                 'reverse': reverse,
5854                 'closest': False if limit is None else closest,
5855                 'limit_text': limit_text,
5856                 'limit': limit}
5857             if field in self.settings:
5858                 self.settings[field].update(data)
5859             else:
5860                 self.settings[field] = data
5861
5862         sort_list = (
5863             tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5864             + (tuple() if params.get('format_sort_force', False)
5865                 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5866             + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5867
5868         for item in sort_list:
5869             match = re.match(self.regex, item)
5870             if match is None:
5871                 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5872             field = match.group('field')
5873             if field is None:
5874                 continue
5875             if self._get_field_setting(field, 'type') == 'alias':
5876                 alias, field = field, self._get_field_setting(field, 'field')
5877                 if self._get_field_setting(alias, 'deprecated'):
5878                     self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5879                                                 f'be removed in a future version. Please use {field} instead')
5880             reverse = match.group('reverse') is not None
5881             closest = match.group('separator') == '~'
5882             limit_text = match.group('limit')
5883
5884             has_limit = limit_text is not None
5885             has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5886             has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5887
5888             fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5889             limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5890             limit_count = len(limits)
5891             for (i, f) in enumerate(fields):
5892                 add_item(f, reverse, closest,
5893                          limits[i] if i < limit_count
5894                          else limits[0] if has_limit and not has_multiple_limits
5895                          else None)
5896
5897     def print_verbose_info(self, write_debug):
5898         if self._sort_user:
5899             write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5900         if self._sort_extractor:
5901             write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5902         write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5903             '+' if self._get_field_setting(field, 'reverse') else '', field,
5904             '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5905                           self._get_field_setting(field, 'limit_text'),
5906                           self._get_field_setting(field, 'limit'))
5907             if self._get_field_setting(field, 'limit_text') is not None else '')
5908             for field in self._order if self._get_field_setting(field, 'visible')]))
5909
5910     def _calculate_field_preference_from_value(self, format, field, type, value):
5911         reverse = self._get_field_setting(field, 'reverse')
5912         closest = self._get_field_setting(field, 'closest')
5913         limit = self._get_field_setting(field, 'limit')
5914
5915         if type == 'extractor':
5916             maximum = self._get_field_setting(field, 'max')
5917             if value is None or (maximum is not None and value >= maximum):
5918                 value = -1
5919         elif type == 'boolean':
5920             in_list = self._get_field_setting(field, 'in_list')
5921             not_in_list = self._get_field_setting(field, 'not_in_list')
5922             value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5923         elif type == 'ordered':
5924             value = self._resolve_field_value(field, value, True)
5925
5926         # try to convert to number
5927         val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5928         is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5929         if is_num:
5930             value = val_num
5931
5932         return ((-10, 0) if value is None
5933                 else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
5934                 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5935                 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5936                 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5937                 else (-1, value, 0))
5938
5939     def _calculate_field_preference(self, format, field):
5940         type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
5941         get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5942         if type == 'multiple':
5943             type = 'field'  # Only 'field' is allowed in multiple for now
5944             actual_fields = self._get_field_setting(field, 'field')
5945
5946             value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5947         else:
5948             value = get_value(field)
5949         return self._calculate_field_preference_from_value(format, field, type, value)
5950
5951     def calculate_preference(self, format):
5952         # Determine missing protocol
5953         if not format.get('protocol'):
5954             format['protocol'] = determine_protocol(format)
5955
5956         # Determine missing ext
5957         if not format.get('ext') and 'url' in format:
5958             format['ext'] = determine_ext(format['url'])
5959         if format.get('vcodec') == 'none':
5960             format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5961             format['video_ext'] = 'none'
5962         else:
5963             format['video_ext'] = format['ext']
5964             format['audio_ext'] = 'none'
5965         # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
5966         #    format['preference'] = -1000
5967
5968         if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5969             # HEVC-over-FLV is out-of-spec by FLV's original spec
5970             # ref. https://trac.ffmpeg.org/ticket/6389
5971             # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5972             format['preference'] = -100
5973
5974         # Determine missing bitrates
5975         if format.get('vcodec') == 'none':
5976             format['vbr'] = 0
5977         if format.get('acodec') == 'none':
5978             format['abr'] = 0
5979         if not format.get('vbr') and format.get('vcodec') != 'none':
5980             format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
5981         if not format.get('abr') and format.get('acodec') != 'none':
5982             format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
5983         if not format.get('tbr'):
5984             format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
5985
5986         return tuple(self._calculate_field_preference(format, field) for field in self._order)