]> jfr.im git - yt-dlp.git/blob - yt_dlp/utils/_utils.py
[utils] Add temporary shim for logging
[yt-dlp.git] / yt_dlp / utils / _utils.py
1 import asyncio
2 import atexit
3 import base64
4 import binascii
5 import calendar
6 import codecs
7 import collections
8 import collections.abc
9 import contextlib
10 import datetime
11 import email.header
12 import email.utils
13 import errno
14 import gzip
15 import hashlib
16 import hmac
17 import html.entities
18 import html.parser
19 import http.client
20 import http.cookiejar
21 import inspect
22 import io
23 import itertools
24 import json
25 import locale
26 import math
27 import mimetypes
28 import netrc
29 import operator
30 import os
31 import platform
32 import random
33 import re
34 import shlex
35 import socket
36 import ssl
37 import struct
38 import subprocess
39 import sys
40 import tempfile
41 import time
42 import traceback
43 import types
44 import unicodedata
45 import urllib.error
46 import urllib.parse
47 import urllib.request
48 import xml.etree.ElementTree
49 import zlib
50
51 from . import traversal
52
53 from ..compat import functools # isort: split
54 from ..compat import (
55 compat_etree_fromstring,
56 compat_expanduser,
57 compat_HTMLParseError,
58 compat_os_name,
59 compat_shlex_quote,
60 )
61 from ..dependencies import brotli, certifi, websockets, xattr
62 from ..socks import ProxyType, sockssocket
63
64 __name__ = __name__.rsplit('.', 1)[0] # Pretend to be the parent module
65
66 # This is not clearly defined otherwise
67 compiled_regex_type = type(re.compile(''))
68
69
def random_user_agent():
    """Return a desktop Chrome-on-Windows User-Agent with a randomly picked Chrome version."""
    chrome_versions = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    version = random.choice(chrome_versions)
    return ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            f'(KHTML, like Gecko) Chrome/{version} Safari/537.36')
113
114
# Content-Encodings we can decode; 'br' is advertised only when the optional
# brotli dependency is importable
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')
120
# Default HTTP headers sent with requests; User-Agent is randomized at import time
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


# Alternative User-Agent strings, selectable by name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
132
133
class NO_DEFAULT:
    """Sentinel used to distinguish "no default supplied" from a ``None`` default"""
    pass
136
137
def IDENTITY(x):
    """Identity function; usable as a default callback/transform"""
    return x
140
141
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Full month names per language code, used when parsing localized dates
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Timezone abbreviation -> offset from UTC in hours
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII transliteration
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
172
# strptime-style formats tried in order when parsing free-form dates
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Additional formats where ambiguous dates are interpreted day-first
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# Additional formats where ambiguous dates are interpreted month-first (US style)
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of "packed" (eval-obfuscated) JavaScript payloads
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Captures the JSON body of a <script type="application/ld+json"> block
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# A non-negative decimal number with optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
243
244
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Make sure the reported encoding actually works
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
259
260
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tmp = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tmp:
            json.dump(obj, tmp, ensure_ascii=False)
        if sys.platform == 'win32':
            # os.rename cannot replace an existing target on Windows
            # (raises WindowsError/FileExistsError), so remove it first
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile creates the file with restrictive permissions;
            # widen to the default mode (0o666 minus the process umask)
            umask = os.umask(0)
            os.umask(umask)
            os.chmod(tmp.name, 0o666 & ~umask)
        os.rename(tmp.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file before propagating
        with contextlib.suppress(OSError):
            os.remove(tmp.name)
        raise
285
286
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
292
293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
294 # the namespace parameter
295
296
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps in *path* to '{namespace}tag' using *ns_map*"""
    def expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(map(expand, path.split('/')))
307
308
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching *xpath* (a string or an iterable of
    candidate xpaths, tried in order).

    Returns *default* if given and nothing matches; raises ExtractorError
    when *fatal* and nothing matches; otherwise returns None.
    """
    if isinstance(xpath, str):
        n = node.find(xpath)
    else:
        # Try each candidate expression, keeping the first hit
        for xp in xpath:
            n = node.find(xp)
            if n is not None:
                break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element %s' % (xpath if name is None else name))
    return None
330
331
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but returns the matched element's text content"""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
345
346
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return the value of attribute *key* on the element matching xpath[@key]"""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (f'{xpath}[@{key}]' if name is None else name))
    return None
358
359
def get_element_by_id(id, html, **kwargs):
    """Return the content of the first tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)
363
364
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the first tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
368
369
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None
374
375
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None
380
381
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute value in the passed HTML document"""
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None
385
386
def get_element_html_by_attribute(attribute, value, html, **kwargs):
    """Return the html of the first tag with the specified attribute value in the passed HTML document"""
    # NB: renamed **kargs -> **kwargs for consistency with get_element_by_attribute;
    # the var-keyword name is not part of the caller-visible interface
    retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None
390
391
def get_elements_by_class(class_name, html, **kwargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match class_name as a whole token inside the (possibly multi-valued) class attribute
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
397
398
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # Match class_name as a whole token inside the (possibly multi-valued) class attribute
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
404
405
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document"""
    return list(map(operator.itemgetter(0), get_elements_text_and_html_by_attribute(*args, **kwargs)))
409
410
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document"""
    return list(map(operator.itemgetter(1), get_elements_text_and_html_by_attribute(*args, **kwargs)))
414
415
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    # If the value starts with whitespace or a quote-like character it can only
    # occur quoted in the attribute; otherwise surrounding quotes are optional.
    # 'quote' becomes the quantifier on the quote group below ('' = required,
    # '?' = optional)
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches the opening tag up to and including attribute=value; (?-x:...)
    # disables verbose mode so whitespace in 'value' is taken literally
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Delegate to the tag-based extractor to find the matching closing tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of wrapping quotes (if any) before entity-decoding
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
441
442
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        super().__init__()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # When handle_endtag raises HTMLBreakOnClosingTagException mid-parse,
        # data stays buffered inside the parser; it is of no further interest,
        # so this override simply discards it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop (possibly implicitly-closed) tags until the matching opener is found
        while self.tagstack:
            if self.tagstack.pop() == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The very first opening tag has now been closed
            raise self.HTMLBreakOnClosingTagException()
483
484
485 # XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index, but raising *exc* instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Prime the parser with just the opening tag so its stack holds *tag*
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            # Feed up to each textual occurrence of the closing tag; the parser
            # raises HTMLBreakOnClosingTagException once the one that actually
            # closes our element (accounting for nesting) is consumed
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
519
520
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        super().__init__()
        self.attrs = {}

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        # Abort parsing as soon as the first tag has been captured
        raise compat_HTMLParseError('done')
531
532
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        # Only collect <li> elements at the top nesting level
        if self._level == 0 and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
548
549
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        # HTMLAttributeParser raises after the first start tag; suppress the
        # exception and keep the attributes it gathered
        parser.feed(html_element)
        parser.close()
    return parser.attrs
569
570
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
578
579
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Collapse runs of whitespace, turn <br> and paragraph breaks into
    # newlines, then strip all remaining tags and decode HTML entities
    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    html = re.sub('<.*?>', '', html)
    return unescapeHTML(html).strip()
594
595
class LenientJSONDecoder(json.JSONDecoder):
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        """
        @param transform_source  callable applied to the raw string before decoding
        @param ignore_extra      ignore trailing data after the first JSON value
        @param close_objects     how many unclosed objects/arrays to try to auto-close
        """
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # Each auto-close may take up to two repair passes (comma, then bracket)
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Try to repair truncated JSON at the error position by appending
        ',', '}' or ']'; returns the patched document or None if not repairable"""
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            # Inside an object: drop the trailing comma and close it
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            # Inside an array: drop the trailing comma and close it
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                # NOTE(review): if _close_object returned None here, s is None and
                # the slice below would raise TypeError instead of the JSON error
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
634
635
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        # '-' means standard output
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed or is unsupported; fall back to a plain open
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            # First failure: retry once with a sanitized filename
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
673
674
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
682
683
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Maps one character to its replacement; '\0' marks substitute
        # characters that get post-processed (deduplicated/stripped) below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # '?' and control characters are dropped outright
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
737
738
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows.

    On other platforms the path is returned unchanged unless force=True.
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters illegal in Windows path components, and trailing
    # whitespace/dots, with '#' ('.' and '..' components are kept as-is)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep force-sanitized absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
760
761
def sanitize_url(url, *, scheme='http'):
    """Add *scheme* to protocol-relative URLs and fix common URL typos"""
    if url is None:
        return None
    if url.startswith('//'):
        # Prepend protocol-less URLs with `http:` scheme in order to mitigate
        # the number of unwanted failures due to missing protocol
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    typo_fixes = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in typo_fixes:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
780
781
def extract_basic_auth(url):
    """Strip userinfo from *url*; returns (clean_url, 'Basic ...' header or None)"""
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild the netloc without the user:password@ prefix
    netloc = parts.hostname if parts.port is None else f'{parts.hostname}:{parts.port}'
    stripped_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = f"{parts.username}:{parts.password or ''}".encode()
    return stripped_url, f'Basic {base64.b64encode(credentials).decode()}'
792
793
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request with the URL sanitized/escaped and any
    embedded userinfo moved into an Authorization header"""
    clean_url, auth = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth is not None:
        # urllib.request.Request positional args are (data, headers, ...)
        if len(args) >= 2:
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth
    return urllib.request.Request(clean_url, *args, **kwargs)
800
801
def expand_path(s):
    """Expand shell environment variables and ~ in *s*"""
    return os.path.expandvars(compat_expanduser(s))
805
806
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable, preserving order"""
    def _dedupe():
        seen = []  # A list, not a set: items may be unhashable
        for item in iterable:
            if item not in seen:
                seen.append(item)
                yield item

    gen = _dedupe()
    return gen if lazy else list(gen)
817
818
819 def _htmlentity_transform(entity_with_semicolon):
820 """Transforms an HTML entity to a character."""
821 entity = entity_with_semicolon[:-1]
822
823 # Known non-numeric HTML entity
824 if entity in html.entities.name2codepoint:
825 return chr(html.entities.name2codepoint[entity])
826
827 # TODO: HTML5 allows entities without a semicolon.
828 # E.g. '&Eacuteric' should be decoded as 'Éric'.
829 if entity_with_semicolon in html.entities.html5:
830 return html.entities.html5[entity_with_semicolon]
831
832 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
833 if mobj is not None:
834 numstr = mobj.group(1)
835 if numstr.startswith('x'):
836 base = 16
837 numstr = '0%s' % numstr
838 else:
839 base = 10
840 # See https://github.com/ytdl-org/youtube-dl/issues/7518
841 with contextlib.suppress(ValueError):
842 return chr(int(numstr, base))
843
844 # Unknown entity in name, return its literal representation
845 return '&%s;' % entity
846
847
def unescapeHTML(s):
    """Replace HTML entities in *s* with their decoded characters"""
    if s is None:
        return None
    assert isinstance(s, str)

    def _replace(mobj):
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^&;]+;)', _replace, s)
855
856
def escapeHTML(text):
    """Escape the HTML-special characters &, <, >, \" and ' in *text*"""
    # Single-pass equivalent of chained str.replace calls ('&' handled first,
    # so already-escaped sequences are escaped again, as before)
    return text.translate(str.maketrans({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    }))
866
867
class netrc_from_content(netrc.netrc):
    """netrc parser fed from an in-memory string instead of a file"""

    def __init__(self, content):
        # Deliberately skip netrc.__init__, which insists on reading a file
        self.hosts = {}
        self.macros = {}
        with io.StringIO(content) as buf:
            self._parse('-', buf, False)
873
874
class Popen(subprocess.Popen):
    """subprocess.Popen with PyInstaller environment fixes, hidden console
    windows on Windows, text-mode defaults and a communicate-or-kill helper"""

    if sys.platform == 'win32':
        # Prevent a console window from being shown for spawned processes
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
             https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            # Not running from a PyInstaller bundle
            return

        def _fix(key):
            # PyInstaller saves the original value under <key>_ORIG
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether streams are text-mode so run() can pick '' vs b''
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        # Ensure the process does not outlive an interrupted communicate()
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            # Optionally wait for the killed process to terminate
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run the command to completion; returns (stdout, stderr, returncode)"""
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
931
932
def encodeArgument(s):
    """Coerce a command-line argument to str (legacy byte strings are ASCII-decoded)"""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
938
939
# Named fields for a duration split into clock components
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration in milliseconds into (hours, minutes, seconds, milliseconds)"""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
948
949
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds, e.g. 1:02:03 (or 1:02:03.456 with msec=True)"""
    hours, minutes, seconds, milliseconds = timetuple_from_msec(secs * 1000)
    if hours:
        ret = '%d%s%02d%s%02d' % (hours, delim, minutes, delim, seconds)
    elif minutes:
        ret = '%d%s%02d' % (minutes, delim, seconds)
    else:
        ret = '%d' % seconds
    if msec:
        ret = '%s.%03d' % (ret, milliseconds)
    return ret
959
960
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    # Loads x509 certificates usable for server auth from the given Windows
    # certificate store (e.g. 'CA', 'ROOT') into ssl_context
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        # Store not accessible; silently skip it
        return
    for cert in certs:
        # A single bad certificate must not prevent loading the rest
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
972
973
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose SSLContext is configured from
    *params* (certificate checking, legacy-server workarounds, client certs)"""
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer certifi's CA bundle when available (unless disabled via compat_opts)
        if certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1035
1036
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" blurb, joined after *before*.

    The message is capitalized when *before* is empty or ends a sentence.
    """
    from ..update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if before and not before.endswith(('.', '!', '?')):
        return f'{before} {msg}'

    msg = msg[0].title() + msg[1:]
    return f'{before} {msg}' if before else msg
1048
1049
class YoutubeDLError(Exception):
    """Root of the yt-dlp exception hierarchy."""
    msg = None  # default message; subclasses may override

    def __init__(self, msg=None):
        # Fall back to the class-level message, then to the class name.
        if msg is None:
            msg = self.msg if self.msg is not None else type(self).__name__
        self.msg = msg
        super().__init__(self.msg)
1060
1061
# Exceptions that indicate a (possibly transient) network problem rather than a bug
network_exceptions = (urllib.error.URLError, http.client.HTTPException, socket.error)
if hasattr(ssl, 'CertificateError'):  # always present on Python 3
    network_exceptions += (ssl.CertificateError,)
1066
1067
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        # Errors raised while a network exception is being handled are expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # Unwrap chained ExtractorErrors so exc_info points at the root cause
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: "[ie] video_id: msg (caused by ...)" plus the
        # bug-report blurb for unexpected errors
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        # Combined traceback of this error and its cause (if any), or None
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Keep msg/args in sync when any attribute used by __msg changes
        # after construction (e.g. assigning .video_id later)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
1109
1110
class UnsupportedError(ExtractorError):
    """Raised when no suitable extractor exists for the given URL."""

    def __init__(self, url):
        super().__init__(f'Unsupported URL: {url}', expected=True)
        self.url = url
1116
1117
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regex pattern fails to match."""
1121
1122
class GeoRestrictedError(ExtractorError):
    """Raised when a video is unavailable from the user's geographic location.

    Thrown when a website refuses to serve a video because of geographic
    restrictions; always treated as an expected error.
    *countries*, if given, lists the country codes where the video is available.
    """

    def __init__(self, msg, countries=None, **kwargs):
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
1134
1135
class UserNotLive(ExtractorError):
    """Raised when the requested channel/user has no ongoing live stream."""

    def __init__(self, msg=None, **kwargs):
        super().__init__(msg or 'The channel is not currently live', **{**kwargs, 'expected': True})
1142
1143
class DownloadError(YoutubeDLError):
    """Raised by FileDownloader objects when not configured to continue on errors.

    Carries the appropriate error message; *exc_info*, if given, records the
    original exception (as returned by sys.exc_info()).
    """

    def __init__(self, msg, exc_info=None):
        self.exc_info = exc_info
        super().__init__(msg)
1156
1157
class EntryNotInPlaylist(YoutubeDLError):
    """Raised by YoutubeDL when a requested entry is missing from the playlist info_dict."""
    msg = 'Entry not found in info'
1165
1166
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        # Append the offending filename to the message when known.
        # (Previously the literal string '(unknown)' was appended, silently
        # discarding the *filename* argument.)
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)
1179
1180
class PostProcessingError(YoutubeDLError):
    """Raised from a PostProcessor's .run() method to signal a failed postprocessing task."""
1187
1188
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted."""
    msg = 'The download was cancelled'
1192
1193
class ExistingVideoReached(DownloadCancelled):
    """Raised when --break-on-existing is triggered."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1197
1198
class RejectedVideoReached(DownloadCancelled):
    """Raised when --break-match-filter is triggered."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1202
1203
class MaxDownloadsReached(DownloadCancelled):
    """Raised when the --max-downloads limit has been reached."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1207
1208
class ReExtractInfo(YoutubeDLError):
    """Signals that the video info needs to be extracted again."""

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1215
1216
class ThrottledDownload(ReExtractInfo):
    """Raised when the measured download speed falls below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)
1223
1224
class UnavailableVideoError(YoutubeDLError):
    """Raised when a video is requested in a format that is not available for it.

    *err*, if given, is appended to the base message.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1237
1238
class ContentTooShortError(YoutubeDLError):
    """Raised when the downloaded data is smaller than the server announced.

    Usually indicates an interrupted connection. Both sizes are in bytes.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1252
1253
class XAttrMetadataError(YoutubeDLError):
    """Raised when reading/writing extended file attributes fails.

    *code* is the OS errno (if any); *reason* classifies the failure as
    'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify the failure from the errno or the message text
        if code in (errno.ENOSPC, errno.EDQUOT) or 'No space left' in msg or 'Disk quota exceeded' in msg:
            self.reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1268
1269
class XAttrUnavailableError(YoutubeDLError):
    """Raised when no working implementation for extended attributes is available."""
1272
1273
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate *http_class*, optionally binding it to the configured source address.

    When params['source_address'] is set, address resolution is restricted to
    the IP family (IPv4 vs IPv6) matching that address before connecting.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Infer the address family from the textual form of the source address
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate in turn, keeping the last error for re-raise
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1319
1320
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open an HTTP connection, routing through a SOCKS proxy if requested."""
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded *data* (raw or zlib-wrapped)."""
        if not data:
            return data
        try:
            # raw deflate stream (no zlib header), as sent by some servers
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli-encoded *data* (requires the brotli dependency)."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress gzip-encoded *data*, tolerating trailing garbage."""
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            # Retry with progressively shorter inputs before giving up
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Youtubedl-no-compression' in req.headers:  # deprecated
            req.headers.pop('Youtubedl-no-compression', None)
            req.add_header('Accept-encoding', 'identity')

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
1448
1449
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* that tunnels through the given SOCKS proxy.

    *socks_proxy* is a URL such as socks5://user:pass@host:port; supported
    schemes are socks/socks4, socks4a and socks5.

    @raises ValueError  if the proxy URL uses an unsupported scheme
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Previously an unknown scheme fell through and crashed later with
        # an opaque NameError; fail fast with a clear message instead
        raise ValueError(f'Unsupported SOCKS proxy scheme: {scheme}')

    def unquote_if_non_empty(s):
        # Empty/None credentials are passed through untouched
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # Wrap with TLS when tunnelling an HTTPS connection
            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1491
1492
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler that honours yt-dlp params (source address, SOCKS proxy, SSL context)."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        conn_class, kwargs = self._https_conn_class, {}

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        proxy = req.headers.get('Ytdl-socks-proxy')
        if proxy:
            conn_class = make_socks_conn_class(conn_class, proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            reason = e.reason
            # Surface a more actionable hint for legacy-renegotiation failures
            if isinstance(reason, ssl.SSLError) and getattr(reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE':
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
1521
1522
def is_path_like(f):
    """Whether *f* can be treated as a filesystem path (str, bytes or os.PathLike)."""
    return isinstance(f, (str, bytes, os.PathLike))
1525
1526
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """Cookie processor that also applies cookie handling to HTTPS traffic."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    # HTTPS uses the same cookie logic as plain HTTP
    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1536
1537
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    Based on the CPython HTTPRedirectHandler [1], with fixed and improved
    logic to better align with RFC 7231 and real browser behavior [2][3].

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        method = req.get_method()
        new_method, new_data = method, req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        # A 303 must use GET or HEAD for the subsequent request
        # (https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4);
        # browsers also commonly turn 301/302 POSTs into GETs
        # (sections 6.4.2 / 6.4.3), so we do the same.
        if (code == 303 and method != 'HEAD') or (code in (301, 302) and method == 'POST'):
            new_method = 'GET'

        # only remove payload if method changed (e.g. POST to GET)
        if new_method != method:
            new_data = None
            remove_headers += ['Content-Length', 'Content-Type']

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
1586
1587
def extract_timezone(date_str):
    """Split a trailing timezone specification off *date_str*.

    Returns (timezone, date_str) where timezone is a datetime.timedelta and
    date_str has the matched timezone portion removed (when one was found).
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # Fall back to named timezones (e.g. "12:00 EST")
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        return datetime.timedelta(hours=timezone or 0), date_str

    date_str = date_str[:-len(m.group('tz'))]
    sign = m.group('sign')
    if not sign:  # bare 'Z' means UTC
        return datetime.timedelta(), date_str
    factor = 1 if sign == '+' else -1
    return datetime.timedelta(
        hours=factor * int(m.group('hours')),
        minutes=factor * int(m.group('minutes'))), date_str
1616
1617
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp for an ISO 8601-like date string, or None.

    @param delimiter  separator between date and time parts (default 'T')
    @param timezone   datetime.timedelta offset; extracted from the string when None
    """
    if date_str is None:
        return None

    # strptime's %S cannot handle fractional seconds - drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
        return calendar.timegm(dt.timetuple())
1633
1634
def date_formats(day_first=True):
    """Return the strptime format list matching the day/month ordering preference."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1637
1638
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD, or None."""
    if date_str is None:
        return None

    upload_date = None
    # Commas and AM/PM + timezone markers confuse strptime - strip them first
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    # Try every known format; the last one that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')

    if upload_date is None:
        # Fall back to RFC 2822-style parsing
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')

    return str(upload_date) if upload_date is not None else None
1661
1662
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp for a free-form date string, or None.

    @param day_first  Whether ambiguous numeric dates are day-first (DD/MM)
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas, pipes and weekday names; collapse runs of whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # Record the PM offset before the AM/PM marker is stripped below
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to RFC 2822-style parsing
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1694
1695
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from *url*, falling back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1707
1708
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build the subtitle filename by swapping the media extension for "<lang>.<format>"."""
    return replace_extension(filename, f'{sub_lang}.{sub_format}', expected_real_ext)
1711
1712
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.

    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # Plain DATE with no offset suffix
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # Recursively resolve the base date, then apply the signed offset
    base = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    if unit in ('month', 'year'):
        # timedelta cannot represent calendar months/years
        result = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit = 'day'
            amount *= 7
        result = base + datetime.timedelta(**{unit + 's': amount})
    return datetime_round(result, unit) if auto_precision else result
1753
1754
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1765
1766
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by a (possibly negative) number of months."""
    total = dt.month - 1 + months
    year = dt.year + total // 12
    month = total % 12 + 1
    # Clamp the day so e.g. Jan 31 + 1 month lands on the last day of February
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
1774
1775
def datetime_round(dt, precision='day'):
    """Round a datetime object's time to the given precision.

    @param precision  one of 'microsecond' (no-op), 'second', 'minute', 'hour', 'day'
    """
    if precision == 'microsecond':
        return dt

    unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    timestamp = calendar.timegm(dt.timetuple())
    rounded = ((timestamp + unit / 2) // unit) * unit
    return datetime.datetime.utcfromtimestamp(rounded)
1792
1793
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings not matching that format are returned unchanged.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
1802
1803
class DateRange:
    """An inclusive interval between two dates."""

    def __init__(self, start=None, end=None):
        """*start* and *end* must be strings in the format accepted by date_from_str;
        None means unbounded on that side."""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start, strict=True)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Return a range containing only the given day."""
        return cls(day, day)

    def __contains__(self, date):
        """Whether *date* (a datetime.date or parseable string) falls within the range."""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1837
1838
@functools.cache
def system_identifier():
    """One-line description of the Python runtime and platform, for debug output."""
    impl = platform.python_implementation()
    if impl == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        impl += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        impl,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1857
1858
@functools.cache
def get_windows_version():
    """Return the Windows version tuple, or () when not running on Windows."""
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1866
1867
def write_string(s, out=None, encoding=None):
    """Write the str *s* to *out* (default: stderr), encoding for byte streams."""
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    target, enc = out, None
    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: encode ourselves
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: write there directly
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    target.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1887
1888
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report a deprecation: once per unique message through the CLI printer
    when running as yt-dlp, or as a `DeprecationWarning` when used as a library."""
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return
    if msg in deprecation_warning._cache:
        return  # already reported once
    deprecation_warning._cache.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# De-duplication cache for CLI mode
deprecation_warning._cache = set()
1904
1905
def bytes_to_intlist(bs):
    """Convert a bytes-like object (or str) into a list of integer values."""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # bytes/bytearray already yield ints when indexed
        return list(bs)
    # str (or similar): fall back to code points
    return [ord(ch) for ch in bs]
1913
1914
def intlist_to_bytes(xs):
    """Pack a sequence of byte values (0-255) back into a bytes object."""
    if not xs:
        return b''
    return struct.pack(f'{len(xs)}B', *xs)
1919
1920
class LockingUnsupportedError(OSError):
    # Raised when neither the Windows (msvcrt) nor the fcntl locking backend
    # is available on this platform
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1926
1927
# Cross-platform file locking: defines _lock_file(f, exclusive, block) and
# _unlock_file(f) using Win32 LockFileEx/UnlockFileEx on Windows and
# fcntl.flock/lockf elsewhere
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure expected by LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the largest possible byte range of the file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the later unlock call
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK, 0x1 == LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # Lock is held by someone else and non-blocking was requested
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            # Try flock first, then lockf; last resort adds LOCK_NB
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
2014
2015
class locked_file:
    """A file wrapper that holds an OS-level lock on the file while it is open.

    Supports the plain 'r'/'rb'/'a'/'ab'/'w'/'wb' modes and can be used either
    as a context manager or via the open()/close() aliases. Unknown file
    attributes are proxied to the underlying file object.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only after we hold the lock, so concurrent readers
            # never observe a truncated-but-unlocked file
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the object can also be used without a `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate everything else (read, write, ...) to the wrapped file
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2079
2080
@functools.cache
def get_filesystem_encoding():
    """Name of the encoding used for filesystem paths ('utf-8' as a fallback)."""
    enc = sys.getfilesystemencoding()
    return 'utf-8' if enc is None else enc
2085
2086
def shell_quote(args):
    """Join *args* into a single shell-safe command line string."""
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(as_text(arg)) for arg in args)
2096
2097
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{payload}'
2106
2107
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url() back into (url, data).

    Returns (smug_url, default) when nothing was smuggled.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
2115
2116
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format *num* with a decimal (k, M, ...) or binary (Ki, Mi, ...) suffix.

    Returns None for negative or unparsable input.
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    suffixes = 'kMGTPEZY'
    exp = min(int(math.log(num, factor)), len(suffixes)) if num else 0
    unit = ['', *suffixes][exp]
    if factor == 1024:
        # Binary units: 'k' -> 'Ki', 'M' -> 'Mi', ... ('' stays '')
        unit = {'k': 'Ki', '': ''}.get(unit, f'{unit}i')
    return fmt % (num / factor ** exp, unit)
2129
2130
def format_bytes(bytes):
    """Human-readable byte count using binary units, or 'N/A'."""
    # NB: parameter name shadows the builtin, but is kept for interface compatibility
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2133
2134
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number> <unit>' in *s* using *unit_table* as a multiplier map.

    In non-strict mode ',' is also accepted as a decimal separator and only
    the beginning of *s* has to match. Returns a rounded int, or None.
    """
    num_pattern = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    unit_pattern = '|'.join(map(re.escape, unit_table))
    matcher = re.fullmatch if strict else re.match
    mobj = matcher(rf'(?P<num>{num_pattern})\s*(?P<unit>{unit_pattern})\b', s)
    if mobj is None:
        return None
    value = float(mobj.group('num').replace(',', '.'))
    return round(value * unit_table[mobj.group('unit')])
2146
2147
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    units = ['', *'KMGTPEZY']
    table = {unit: 1024**exp for exp, unit in enumerate(units)}
    return lookup_unit_table(table, s.upper(), strict=True)
2153
2154
def parse_filesize(s):
    """Parse a human-readable file size ('5 MiB', '300kB', ...) into bytes, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # NOTE(review): some lower-case mappings are deliberately inconsistent
    # (e.g. 'kB' -> 1024 but 'kb' -> 1000) — kept as-is for compatibility
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
2224
2225
def parse_count(s):
    """Parse a (possibly abbreviated) count like '1.2M' or '1,234 views' into an int."""
    if s is None:
        return None

    # Drop a leading non-numeric word ("Views 1,234" -> "1,234")
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    parsed = lookup_unit_table(multipliers, s)
    if parsed is not None:
        return parsed

    # Fall back to the leading number, if any ("1,234 likes")
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(mobj.group(1)) if mobj else None
2253
2254
def parse_resolution(s, *, lenient=False):
    """Extract {'width', 'height'} (or just {'height'}) from a resolution-like string.

    Understands 'WxH', '720p'/'1080i' and '4k'/'8k' forms; returns {} otherwise.
    """
    if s is None:
        return {}

    pattern = (r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
               else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    mobj = re.search(pattern, s)
    if mobj:
        return {'width': int(mobj.group('w')), 'height': int(mobj.group('h'))}

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(mobj.group(1)) * 540}

    return {}
2278
2279
def parse_bitrate(s):
    """Extract an integer bitrate in kbps from a string; None if absent."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
2286
2287
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in names:
        return None
    return names.index(name) + 1
2297
2298
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbrevs = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev not in abbrevs:
        return None
    return abbrevs.index(abbrev) + 1
2307
2308
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave already-escaped entities and character references untouched
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
2315
2316
def setproctitle(title):
    """Best-effort: set the process name shown by tools like `ps`.

    Uses libc prctl() via ctypes; silently does nothing where unavailable.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        # Not a glibc system (e.g. Windows, musl without that soname)
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME (see prctl(2))
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2342
2343
def remove_start(s, start):
    """Strip the prefix *start* from *s* if present (None-safe)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2346
2347
def remove_end(s, end):
    """Strip the suffix *end* from *s* if present (None-safe)."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
2350
2351
def remove_quotes(s):
    """Drop one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
2359
2360
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    if netloc.startswith('www.'):
        netloc = netloc[len('www.'):]
    return netloc or None
2367
2368
def url_basename(url):
    """Last path component of *url* (ignores query and fragment)."""
    return urllib.parse.urlparse(url).path.strip('/').split('/')[-1]
2372
2373
def base_url(url):
    """Scheme + host + directory part of *url* (raises AttributeError if malformed)."""
    mobj = re.match(r'https?://[^?#]+/', url)
    return mobj.group()
2376
2377
def urljoin(base, path):
    """Join *base* and *path* like urllib, tolerating bytes input.

    Returns None for empty/invalid path or a base that is not an http(s)
    or protocol-relative URL; returns *path* unchanged when it is already absolute.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not path or not isinstance(path, str):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path  # already absolute (or protocol-relative)
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
2391
2392
class HEADRequest(urllib.request.Request):
    """A urllib Request that issues HEAD instead of GET."""
    def get_method(self):
        return 'HEAD'
2396
2397
class PUTRequest(urllib.request.Request):
    """A urllib Request that issues PUT instead of GET/POST."""
    def get_method(self):
        return 'PUT'
2401
2402
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """int(v) scaled by invscale/scale; *default* when conversion is impossible.

    When *get_attr* is given, the named attribute of *v* is converted instead.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        result = int(v)
    except (ValueError, TypeError, OverflowError):
        return default
    return result * invscale // scale
2410
2411
def str_or_none(v, default=None):
    """str(v), or *default* when v is None."""
    if v is None:
        return default
    return str(v)
2414
2415
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Strip thousands separators and stray '+'/'.' characters
        int_str = re.sub(r'[,\.\+]', '', int_str)
    try:
        return int(int_str)
    except (ValueError, TypeError, OverflowError):
        return None
2423
2424
def float_or_none(v, scale=1, invscale=1, default=None):
    """float(v) scaled by invscale/scale; *default* on None or bad input."""
    if v is None:
        return default
    try:
        result = float(v)
    except (ValueError, TypeError):
        return default
    return result * invscale / scale
2432
2433
def bool_or_none(v, default=None):
    """Pass through real booleans only; anything else becomes *default*."""
    if isinstance(v, bool):
        return v
    return default
2436
2437
def strip_or_none(v, default=None):
    """v.strip() for strings; *default* for any other type."""
    if isinstance(v, str):
        return v.strip()
    return default
2440
2441
def url_or_none(url):
    """Return the stripped URL if it uses a supported scheme (or is
    protocol-relative); otherwise None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
2447
2448
def request_to_url(req):
    """Accept either a urllib Request or a plain URL string; return the URL."""
    if isinstance(req, urllib.request.Request):
        return req.get_full_url()
    return req
2454
2455
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a unix timestamp (int/float) or a 'YYYYMMDD' string through
    strftime; returns *default* for anything unparsable."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            dt = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                  + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            dt = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            dt = None  # triggers AttributeError below -> default
        # Emulate %s on platforms whose strftime lacks it (e.g. Windows)
        date_format = re.sub(
            r'(?<!%)(%%)*%s', rf'\g<1>{int(dt.timestamp())}', date_format)
        return dt.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2473
2474
def parse_duration(s):
    """Parse a duration string ('1:23:45', '2h 30m', 'PT1H30M', '90 min', ...)
    into seconds (float), or None when unrecognized."""
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None
    # 1) Clock-style [[DD:]HH:]MM:SS[.ms]
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    # 2) ISO-8601-ish / verbose ('P1DT2H', '3 days, 4 hours', '90s', ...)
    elif (m := re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)):
        days, hours, mins, secs, ms = m.groups()
    # 3) Fractional '3.5 hours' / '90 minutes'
    elif (m := re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)):
        hours, mins = m.groups()
    else:
        return None

    if ms:
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2529
2530
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the file extension: 'f.mp4' -> 'f.temp.mp4'.

    When *expected_real_ext* is given and the actual extension differs,
    *ext* is appended after the whole name instead ('f.mkv' -> 'f.mkv.temp'),
    so the unexpected extension is preserved.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return f'{name}.{ext}{real_ext}'
    # BUG FIX: the mismatch branch previously returned f'(unknown).{ext}',
    # discarding the original filename entirely; keep the name and append ext
    return f'{filename}.{ext}'
2537
2538
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the extension of *filename* for *ext*.

    When *expected_real_ext* is given and the real extension differs, the
    old extension is kept and *ext* is appended instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return f'{name}.{ext}'
2544
2545
def check_executable(exe, args=()):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: default changed from the mutable `[]` to an immutable tuple
    # (best practice); callers passing a list are unaffected
    try:
        Popen.run([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
2554
2555
def _get_exe_version_output(exe, args):
    """Run `exe args...` and capture combined stdout/stderr.

    Returns the output string on success, None when the process exits
    non-zero, or False when the executable cannot be run at all.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
                                   stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if ret:
            return None
    except OSError:
        return False
    return stdout
2568
2569
2570 def detect_exe_version(output, version_re=None, unrecognized='present'):
2571 assert isinstance(output, str)
2572 if version_re is None:
2573 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2574 m = re.search(version_re, output)
2575 if m:
2576 return m.group(1)
2577 else:
2578 return unrecognized
2579
2580
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        # Executable exists but exited non-zero
        return unrecognized[-1]
    if not output:
        # False (not runnable) or empty output: pass through
        return output
    return detect_exe_version(output, version_re, unrecognized[0])
2591
2592
def frange(start=0, stop=None, step=1):
    """Float range"""
    if stop is None:
        start, stop = 0, start
    # step == 0 yields nothing; negative step counts downwards
    direction = (step > 0) - (step < 0)
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2601
2602
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache  # items consumed so far
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map index i to its mirror from the end: i -> -(i + 1); None passes through
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Consume only as many items as the requested indices need
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # Shares the cache and the (partially consumed) iterable with self
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2690
2691
class PagedList:
    """Base class for lazily paged sequences; subclasses implement _getslice()."""

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc  # pagenum -> iterable of entries
        self._pagesize = pagesize
        self._pagecount = float('inf')  # unknown until a fetch fails
        self._use_cache = use_cache
        self._cache = {}  # pagenum -> list of entries

    def getpage(self, pagenum):
        page_results = self._cache.get(pagenum)
        if page_results is None:
            # Pages beyond the known page count are empty by definition
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # NOTE: cache is required so repeated indexing does not refetch pages
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
2730
2731
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested range within the current page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last good page so later getpage() calls
                # return [] instead of refetching
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2771
2772
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        skip_elems = start - start_page * self._pagesize  # offset within first page
        only_more = None if end is None else end - start  # entries still wanted
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
2797
2798
class PlaylistEntries:
    """Iterates a playlist infodict's entries according to the user's
    playlist_items/playliststart/playlistend parameters, yielding
    (1-based index, entry) pairs."""

    # Placeholder for entries missing from an incomplete (requested_entries) list
    MissingEntry = object()
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Re-expand to full playlist positions, gaps become MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches item specs like '5', '2:10', '::2', '-5:', '1:inf:3'
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield ints and slices parsed from a --playlist-items spec; raises ValueError."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # float_or_none turns 'inf' into float('inf'); plain ints stay ints
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        # May return None when the total count cannot be determined cheaply
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function i -> entry that normalizes all backing stores'
        # out-of-range behavior to self.IndexError
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2933
2934
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
2941
2942
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
2949
2950
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe_chars)
2954
2955
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)

    def esc(component):
        # Same escaping rules as escape_rfc3986()
        return urllib.parse.quote(component, b"%/;:@&=+$,!~*'()?#[]")

    return parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=esc(parts.path),
        params=esc(parts.params),
        query=esc(parts.query),
        fragment=esc(parts.fragment),
    ).geturl()
2966
2967
def parse_qs(url, **kwargs):
    """Parse the query string of *url* into a dict mapping keys to value lists."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2970
2971
def read_batch_urls(batch_fd):
    """Read a batch file and return the list of URLs it contains.

    @param batch_fd  A file object (or iterable of str/bytes lines); closed on return
    @returns         List of cleaned URLs; blank lines and comments are dropped
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # Strip both the raw UTF-8 BOM bytes (mis-decoded as latin-ish chars)
        # and the already-decoded BOM codepoint
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit must be passed by keyword (positional form is
        # deprecated since Python 3.13)
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2989
2990
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2993
2994
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url str or parse url tuple
    @param query_update update query
    @returns str
    """
    if isinstance(url, str):
        # Fast path: nothing to change
        if not kwargs and not query_update:
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged_query = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged_query, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
3013
3014
def update_url_query(url, query):
    """Merge the dict *query* into the query string of *url*."""
    return update_url(url, query_update=query)
3017
3018
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone a urllib Request, optionally overriding its URL, data, headers or query."""
    new_headers = req.headers.copy()
    new_headers.update(headers or {})
    full_url = update_url_query(url or req.get_full_url(), query)
    # Preserve the original HTTP verb for HEAD/PUT requests
    method = req.get_method()
    if method == 'HEAD':
        request_class = HEADRequest
    elif method == 'PUT':
        request_class = PUTRequest
    else:
        request_class = urllib.request.Request
    new_req = request_class(
        full_url, data=data or req.data, headers=new_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3036 return new_req
3037
3038
3039 def _multipart_encode_impl(data, boundary):
3040 content_type = 'multipart/form-data; boundary=%s' % boundary
3041
3042 out = b''
3043 for k, v in data.items():
3044 out += b'--' + boundary.encode('ascii') + b'\r\n'
3045 if isinstance(k, str):
3046 k = k.encode()
3047 if isinstance(v, str):
3048 v = v.encode()
3049 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3050 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3051 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3052 if boundary.encode('ascii') in content:
3053 raise ValueError('Boundary overlaps with data')
3054 out += content
3055
3056 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3057
3058 return out, content_type
3059
3060
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-supplied boundary cannot be regenerated on collision,
        # so any ValueError propagates to the caller
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # Random boundary collided with the payload; try another one
            continue
3089
3090
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether *x* is an iterable collection (str/bytes/mappings excluded by default)."""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
3095
3096
def variadic(x, allowed_types=NO_DEFAULT):
    """Return *x* unchanged if it is an iterable collection, else wrap it in a 1-tuple."""
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
3102
3103
def try_call(*funcs, expected_type=None, args=(), kwargs=None):
    """Call each of *funcs* in turn and return the first usable result.

    A result is usable if the call raises none of the common lookup/type
    errors and (when *expected_type* is given) is an instance of it.
    Returns None if no function produced a usable result.

    NB: the old mutable defaults (args=[], kwargs={}) were replaced with
    immutable/None sentinels; behavior for all existing callers is unchanged.
    """
    for f in funcs:
        try:
            val = f(*args, **(kwargs or {}))
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            pass
        else:
            if expected_type is None or isinstance(val, expected_type):
                return val
3113
3114
def try_get(src, getter, expected_type=None):
    # Legacy wrapper around try_call: apply one or more getter callables to
    # src, returning the first result that doesn't raise (and matches
    # expected_type, if given)
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3117
3118
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of *dct* with only the items for which cndn(key, value) is true."""
    return dict(filter(lambda item: cndn(*item), dct.items()))
3121
3122
def merge_dicts(*dicts):
    """Merge dicts left to right: earlier non-None values win, except that an
    earlier empty string may be replaced by a later string value."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            if key not in merged:
                if value is not None:
                    merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
3131
3132
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Decode bytes-like input to str with the given encoding; str passes through.
    # NB: the default encoding is evaluated once, at module import time
    return string if isinstance(string, str) else str(string, encoding, errors)
3135
3136
# US movie ratings mapped to the age limits used by parse_age_limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV parental guideline labels mapped to age limits
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """Parse an age limit (int, '18+', US rating or TV guideline) into an int or None."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]

    # Accept 'TV-MA', 'TV_MA' and 'TVMA' style spellings
    suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv = re.match(r'^TV[_-]?(%s)$' % suffixes, s)
    if tv:
        return TV_PARENTAL_GUIDELINES['TV-' + tv.group(1)]
    return None
3172
3173
def strip_jsonp(code):
    """Remove a JSONP callback wrapper, returning the bare JSON payload."""
    jsonp_wrapper = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_wrapper.sub(r'\g<callback_data>', code)
3182
3183
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript value/object literal into valid JSON text.

    @param code    JavaScript source fragment to convert
    @param vars    dict of variable name -> replacement JSON/string value
    @param strict  if True, raise ValueError on unknown identifiers instead
                   of quoting them as strings
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # Hex/octal integer literals, optionally followed by ':' when used as object keys
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Map JS string escapes to their JSON equivalents
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Evaluate a `${...}` interpolation inside a template literal
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Normalize a single matched token (string, number, keyword, ...) to JSON
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Integer object keys must become JSON strings
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # Convert `new Map([[k, v], ...])` into a JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # Best-effort rewrites of common JS constructor/IIFE patterns
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
3262
3263
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def resolve(qid):
        # Unknown qualities sort below every known one
        if qid not in quality_ids:
            return -1
        return quality_ids.index(qid)
    return resolve
3272
3273
# Accepted "when" stages for postprocessing -- TODO confirm exact pipeline semantics
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3275
3276
# Default output filename templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types; values look like default filename infixes
# (e.g. 'info.json'), None meaning no extra infix -- TODO confirm against callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
3294
# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template regex: callers fill {0} with the allowed key pattern and {1} with
# the allowed conversion-type pattern before compiling
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])? # unused in python
        {1} # conversion type
    )
    '''


# All %-conversion type characters accepted by STR_FORMAT_RE_TMPL users
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
3313
3314
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    # Truncate so the result (including the ellipses) fits within `length`
    return s if len(s) <= length else s[:length - len(ELLIPSES)] + ELLIPSES
3323
3324
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
3327
3328
def is_outdated_version(version, limit, assume_new=True):
    """Whether *version* is older than *limit*; unparsable input follows assume_new."""
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        # Non-numeric version component: fall back to the caller's assumption
        return not assume_new
    return outdated
3336
3337
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Deferred local import, presumably to avoid a circular dependency at
    # module load -- TODO confirm
    from ..update import is_non_updateable

    return not is_non_updateable()
3344
3345
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(compat_shlex_quote, args))
3349
3350
def error_to_str(err):
    """Format an exception as 'TypeName: message'."""
    return '{}: {}'.format(type(err).__name__, err)
3353
3354
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (optionally with parameters) to a file extension.

    @param mt       MIME type string, e.g. 'video/mp4; codecs="avc1"'
    @param default  Returned for non-string input and unknown types; when
                    omitted, non-string input yields None and unknown
                    subtypes are returned (with '+' replaced by '.')
    """
    if not isinstance(mt, str):
        if default is not NO_DEFAULT:
            return default
        return None

    # Keys are either bare subtypes or full 'type/subtype' MIME types
    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop any parameters (';charset=...') and normalize case
    mimetype = mt.partition(';')[0].strip().lower()
    _, _, subtype = mimetype.rpartition('/')

    # Try the full MIME type, then the bare subtype, then the '+suffix' part
    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    elif default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
3443
3444
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension or URL (None if unknown)."""
    if not ext_or_url:
        return None
    # Bare extensions need a fake filename for mimetypes to work
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime, _ = mimetypes.guess_type(filename)
    return mime
3451
3452
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string into vcodec/acodec/dynamic_range
    (and scodec when a subtitle codec is present)."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Strip leading zeros from numeric parts before splitting on '.'
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                continue  # only the first video codec is kept
            vcodec = full_codec
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                # av1 with 4th field '10' is treated as HDR10 -- TODO confirm
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                # vp9 profile 2 is treated as HDR10 -- TODO confirm
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Two unrecognized codecs: assume video + audio in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3493
3494
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension compatible with the given codec and
    extension lists of the streams being merged.

    @param preferences  Ordered candidate extensions; when None/empty,
                        mkv is always an acceptable fallback
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    # More than one video or audio stream: only mkv is known to support it
    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Reduce e.g. 'avc1.640028' to 'avc1' (leading zeros already dropped)
    sanitize_codec = functools.partial(
        try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    # First pass: pick by codec compatibility
    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    # Second pass: fall back to extension-family compatibility
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3534
3535
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Guess the file extension of a response: try the Content-Disposition
    filename, then the x-amz-meta-name header, then the Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    meta_ext = getheader('x-amz-meta-name')
    if meta_ext:
        # Take the part after the last '.'
        e = meta_ext.rpartition('.')[2]
        if e:
            return e

    return mimetype2ext(getheader('Content-Type'), default=default)
3554
3555
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI from raw bytes and a MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3558
3559
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No limit configured, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3568
3569
# List of known byte-order-marks (BOM)
# NB: keep the 4-byte UTF-32 marks before their 2-byte UTF-16 prefixes
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip repeated BOMs and remember the encoding they imply
        while first_bytes.startswith(bom):
            encoding = bom_encoding
            first_bytes = first_bytes[len(bom):]
    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3589
3590
def determine_protocol(info_dict):
    """Infer the download protocol of *info_dict* from its URL and extension."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    # Scheme-prefixed streaming protocols (also matches rtmpe/rtmps/mmsh/...)
    for scheme in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(scheme):
            return scheme

    ext = determine_ext(url)
    if ext == 'm3u8':
        # Live HLS cannot use the native downloader
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3611
3612
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: terminal escape sequences and tab markers don't count
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep columns whose filter entry is truthy; missing entries keep the column
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # hide_empty: drop columns whose every data cell is empty (max width 0)
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a delimiter row between the header and the data
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Right-align the text after the tab by padding before it
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3643
3644
3645 def _match_one(filter_part, dct, incomplete):
3646 # TODO: Generalize code with YoutubeDL._build_format_filter
3647 STRING_OPERATORS = {
3648 '*=': operator.contains,
3649 '^=': lambda attr, value: attr.startswith(value),
3650 '$=': lambda attr, value: attr.endswith(value),
3651 '~=': lambda attr, value: re.search(value, attr),
3652 }
3653 COMPARISON_OPERATORS = {
3654 **STRING_OPERATORS,
3655 '<=': operator.le, # "<=" must be defined above "<"
3656 '<': operator.lt,
3657 '>=': operator.ge,
3658 '>': operator.gt,
3659 '=': operator.eq,
3660 }
3661
3662 if isinstance(incomplete, bool):
3663 is_incomplete = lambda _: incomplete
3664 else:
3665 is_incomplete = lambda k: k in incomplete
3666
3667 operator_rex = re.compile(r'''(?x)
3668 (?P<key>[a-z_]+)
3669 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3670 (?:
3671 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3672 (?P<strval>.+?)
3673 )
3674 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3675 m = operator_rex.fullmatch(filter_part.strip())
3676 if m:
3677 m = m.groupdict()
3678 unnegated_op = COMPARISON_OPERATORS[m['op']]
3679 if m['negation']:
3680 op = lambda attr, value: not unnegated_op(attr, value)
3681 else:
3682 op = unnegated_op
3683 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3684 if m['quote']:
3685 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3686 actual_value = dct.get(m['key'])
3687 numeric_comparison = None
3688 if isinstance(actual_value, (int, float)):
3689 # If the original field is a string and matching comparisonvalue is
3690 # a number we should respect the origin of the original field
3691 # and process comparison value as a string (see
3692 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3693 try:
3694 numeric_comparison = int(comparison_value)
3695 except ValueError:
3696 numeric_comparison = parse_filesize(comparison_value)
3697 if numeric_comparison is None:
3698 numeric_comparison = parse_filesize(f'{comparison_value}B')
3699 if numeric_comparison is None:
3700 numeric_comparison = parse_duration(comparison_value)
3701 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3702 raise ValueError('Operator %s only supports string values!' % m['op'])
3703 if actual_value is None:
3704 return is_incomplete(m['key']) or m['none_inclusive']
3705 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3706
3707 UNARY_OPERATORS = {
3708 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3709 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3710 }
3711 operator_rex = re.compile(r'''(?x)
3712 (?P<op>%s)\s*(?P<key>[a-z_]+)
3713 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3714 m = operator_rex.fullmatch(filter_part.strip())
3715 if m:
3716 op = UNARY_OPERATORS[m.group('op')]
3717 actual_value = dct.get(m.group('key'))
3718 if is_incomplete(m.group('key')) and actual_value is None:
3719 return True
3720 return op(actual_value)
3721
3722 raise ValueError('Invalid filter part %r' % filter_part)
3723
3724
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
        Can be True/False to indicate all/none of the keys may be missing.
        All conditions on incomplete keys pass if the key is missing
    """
    # Split on unescaped '&'; every sub-filter must pass
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3735
3736
def match_filter_func(filters, breaking_filters=None):
    """Build a match-filter function from filter string(s).

    @param filters           Filter string(s); the special value '-' makes the
                             returned function interactive (see below)
    @param breaking_filters  Filters whose match raises RejectedVideoReached
    @returns None if no filters given; otherwise a function
             (info_dict, incomplete) -> None when the entry passes,
             NO_DEFAULT when interactive and the entry passes a complete check,
             or a skip-reason string when it fails
    """
    if not filters and not breaking_filters:
        return None
    # Recursively build the breaking filters into a no-op-or-match function
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        ret = breaking_filters(info_dict, incomplete)
        if ret is not None:
            raise RejectedVideoReached(ret)

        # An entry passes if any single filter matches (or there are none)
        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            return NO_DEFAULT if interactive and not incomplete else None
        else:
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3759
3760
class download_range_func:
    """Callable yielding the sections (chapters and/or time ranges) of a video
    to download."""

    def __init__(self, chapters, ranges, from_info=False):
        # chapters: regexes matched against chapter titles
        # ranges: (start, end) pairs in seconds; negative values offset from the end
        # from_info: also use start_time/end_time from the info dict itself
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # No constraints at all: a single empty section means "everything"
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # Negative timestamps count back from the end (requires a known duration)
        return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time

    def __eq__(self, other):
        # NOTE(review): from_info is not part of equality; defining __eq__
        # without __hash__ also makes instances unhashable
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges)

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3801
3802
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression into seconds (None if unparsable)."""
    if not time_expr:
        return None

    # Plain offset, optionally suffixed with 's'
    offset = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    # Clock format H+:MM:SS[.fff] (frames after ':' are treated as a fraction)
    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3814
3815
def srt_subtitles_timecode(seconds):
    # Format seconds as an SRT timestamp: HH:MM:SS,mmm
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3818
3819
def ass_subtitles_timecode(seconds):
    # Format seconds as an ASS timestamp: H:MM:SS.cc (centiseconds)
    time = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3823
3824
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Old TTAF namespaces that are rewritten to the modern TTML ones below
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    # Streaming XML target that renders TTML styling into SRT-ish markup
    # (<b>/<i>/<u>/<font>) as elements are opened and closed
    class TTMLPElementParser:
        # NOTE(review): the list attributes below are class-level mutables
        # shared between instances; safe here since each parse_node() call
        # uses a fresh parser and leaves them balanced -- confirm before reuse
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from the parent element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Re-serialize the <p> subtree and stream it through a fresh parser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; repeat until every parent style is available
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # Styles attached to body/div become the document-wide default
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3991
3992
def cli_option(params, command_option, param, separator=None):
    """Build a CLI argument list for an option that takes a value.

    Looks up *param* in *params*; returns [] when unset, the two-element
    form ['--opt', 'value'] by default, or the joined single-element form
    ['--opt<separator>value'] when *separator* is given.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3998
3999
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a CLI argument list for a boolean option.

    *param* must resolve to True, False or None; the boolean is mapped to
    *true_value*/*false_value* and passed through cli_option (so None
    yields [] and *separator* joins option and value when given).
    """
    value = params.get(param)
    assert value in (True, False, None)
    # Reuse cli_option by handing it a {bool: rendered value} mapping
    return cli_option({True: true_value, False: false_value}, command_option, value, separator)
4004
4005
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return ['--opt'] when *param* equals *expected_value*, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
4008
4009
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Select extractor/postprocessor argument lists from *argdict*.

    *keys* is a list of key names (or tuples of names); the first key group
    with at least one non-None entry wins and its argument lists are
    flattened. A plain list/tuple *argdict* is legacy input: returned as-is
    when *use_compat*, otherwise treated as absent.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = [argdict.get(key.lower()) for key in variadic(key_list)]
        matched = [args for args in candidates if args is not None]
        if matched:
            # Flatten the per-key argument lists into one
            return [arg for args in matched for arg in args]
    return default
4028
4029
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve configuration args for *exe* under *main_key*.

    Builds the lookup key priority list (most specific first) and delegates
    to cli_configuration_args. Compat behaviour is only kept when no key
    suffixes were requested (i.e. the bare root key is in the list).
    """
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in keys:
        # Suffixed keys only: legacy list/tuple argdicts do not apply
        use_compat = False
    else:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
4041
4042
class ISO639Utils:
    """Conversions between ISO 639-1 (2-letter) and ISO 639-2/T (3-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of *code* are considered, so region
        suffixes (e.g. 'en-US') are tolerated. Returns None when unknown.
        """
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Uses a lazily-built reverse map instead of a linear scan per call.
        setdefault keeps the *first* 2-letter code for duplicated 3-letter
        values (e.g. 'heb' -> 'he', not the deprecated 'iw'), matching the
        original first-match iteration order. Returns None when unknown.
        """
        if '_long2short_map' not in cls.__dict__:
            reverse = {}
            for short_name, long_name in cls._lang_map.items():
                reverse.setdefault(long_name, short_name)
            cls._long2short_map = reverse
        return cls._long2short_map.get(code)
4247
4248
class ISO3166Utils:
    """Lookup of full English country names from 2-letter country codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Look up the full English country name for a 2-letter country code.

        The code is case-insensitive; returns None when unknown.
        """
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
4510
4511
class GeoUtils:
    """Helpers for picking geo-appropriate IPv4 addresses."""
    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (dotted string) from a CIDR block.

        *code_or_block* is either a 2-letter country code (looked up in
        _country_ip_map; None when unknown) or a CIDR block string
        'a.b.c.d/prefix'.
        """
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        base_addr, _, prefix_len = block.partition('/')
        # Lowest address in the block, as a 32-bit big-endian integer
        lowest = struct.unpack('!L', socket.inet_aton(base_addr))[0]
        # Highest address: set all host bits
        highest = lowest | (0xffffffff >> int(prefix_len))
        picked = random.randint(lowest, highest)
        return str(socket.inet_ntoa(struct.pack('!L', picked)))
4770
4771
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler that allows overriding the proxy per request via the
    internal 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            # The lambda defaults capture the current values of `type` and
            # `proxy_open` eagerly (late-binding workaround); each scheme gets
            # its own <scheme>_open handler routed through proxy_open.
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (if present) overrides the handler default;
        # the internal header is stripped before the request goes out.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            # SOCKS proxies are signalled via a header; the socket is wrapped
            # with SOCKS by yt-dlp's http/https handlers instead of here.
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        # Fall back to the standard urllib proxy behaviour
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, type)
4795
4796
4797 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4798 # released into Public Domain
4799 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4800
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # int.to_bytes replaces the original hand-rolled 32-bit chunking loop.
    # Negative inputs behaved exactly like 0 in the original (the chunk loop
    # never ran and the fallback produced b'\x00'), so clamp to preserve that.
    n = max(int(n), 0)
    # Minimal big-endian representation; n == 0 still yields one zero byte
    s = n.to_bytes(max(1, (n.bit_length() + 7) // 8), 'big')
    # Front-pad with zeros up to a multiple of blocksize, if requested
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4829
4830
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes is the stdlib equivalent of the original front-padded
    # 4-byte struct.unpack loop; it also returns 0 for empty input.
    return int.from_bytes(s, 'big')
4846
4847
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # Bytes are interpreted little-endian, hence the reversal before hexlify
    plaintext_int = int(binascii.hexlify(data[::-1]), 16)
    ciphertext_int = pow(plaintext_int, exponent, modulus)
    return format(ciphertext_int, 'x')
4863
4864
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    @raises ValueError when data does not fit (needs >= 11 bytes of overhead)
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 8017 (EME-PKCS1-v1_5) requires the padding string PS to consist of
    # *non-zero* octets — the previous randint(0, 254) could emit a zero,
    # which would make a decrypter truncate the message at the stray zero.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4878
4879
4880 def _base_n_table(n, table):
4881 if not table and not n:
4882 raise ValueError('Either table or n must be specified')
4883 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4884
4885 if n and n != len(table):
4886 raise ValueError(f'base {n} exceeds table length {len(table)}')
4887 return table
4888
4889
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4901
4902
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    lookup = {char: index for index, char in enumerate(_base_n_table(n, table))}
    base = len(lookup)
    result = 0
    for char in string:
        result = result * base + lookup[char]
    return result
4910
4911
def decode_packed_codes(code):
    """Decode obfuscated 'packed' JS code (presumably the P.A.C.K.E.R. format
    matched by PACKED_CODES_RE) by substituting back its symbol table."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each base-n token back to its original symbol (the token itself
    # when the symbol slot is empty)
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)],
        obfuscated_code)
4928
4929
def caesar(s, alphabet, shift):
    """Shift every character of *s* that occurs in *alphabet* by *shift*
    positions (wrapping); other characters pass through unchanged."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4937
4938
def rot47(s):
    """Apply the ROT47 substitution cipher (rotates over printable ASCII '!'..'~')."""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4941
4942
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list string into a dict, unquoting quoted values."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
4950
4951
def urshift(val, n):
    """Unsigned (logical) 32-bit right shift of *val* by *n* bits."""
    if val >= 0:
        return val >> n
    # Reinterpret a negative value as its unsigned 32-bit equivalent first
    return (val + 0x100000000) >> n
4954
4955
def write_xattr(path, key, value):
    """Write extended attribute *key* = *value* (bytes) on the file at *path*.

    Tries, in order: NTFS Alternate Data Streams (Windows), the xattr/pyxattr
    Python modules, then the setfattr/xattr command-line tools.
    Raises XAttrMetadataError when a write fails and XAttrUnavailableError
    when no supported backend is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The CLI tools take the value as text, not bytes
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
5005
5006
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the string components
    of a uniformly random date between 1950-01-01 and 1995-12-31."""
    start = datetime.date(1950, 1, 1)
    end = datetime.date(1995, 12, 31)
    chosen = start + datetime.timedelta(random.randint(0, (end - start).days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
5017
5018
def find_available_port(interface=''):
    """Ask the OS for a currently-free TCP port on *interface*.

    Returns the port number, or None when binding fails.
    """
    try:
        with socket.socket() as sock:
            sock.bind((interface, 0))
            _addr, port = sock.getsockname()
            return port
    except OSError:
        return None
5026
5027
# Templates for internet shortcut files, which are plain text files.

# Windows Internet Shortcut (.url) file
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS Finder web location (.webloc) file — an XML property list
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org desktop entry (.desktop) of Type=Link
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps the --write-link format name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5059
5060
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    # NOTE(review): an explicit port 80 is dropped regardless of scheme (also
    # for e.g. https) — confirm this is intended before relying on it.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5103
5104
def to_high_limit_path(path):
    """On Windows, convert *path* to an extended-length ('\\\\?\\') absolute path
    to work around the MAX_PATH limitation. The maximum allowed length for
    individual path segments may still be quite limited. Elsewhere, the path is
    returned unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # The extended-length prefix requires an absolute path
    return '\\\\?\\' + os.path.abspath(path)
5111
5112
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Traverse *field* out of *obj*, returning ``template % func(value)``,
    or *default* when the value is falsy (or matches *ignore* if given)."""
    val = traversal.traverse_obj(obj, *variadic(field))
    skip = (not val) if ignore is NO_DEFAULT else (val in variadic(ignore))
    return default if skip else template % func(val)
5118
5119
def clean_podcast_url(url):
    """Strip known podcast analytics/measurement redirect prefixes from *url*."""
    without_tracker = re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/''', '', url)
    # Collapse the doubled scheme left behind when the removed prefix carried its own
    return re.sub(r'^\w+://(\w+://)', r'\1', without_tracker)
5140
5141
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random UUIDv4-shaped string (every x/y slot gets an arbitrary hex digit)."""
    return re.sub(r'[xy]', lambda _: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5147
5148
def make_dir(path, to_screen=None):
    """
    Create the parent directory of *path* (including intermediates) if needed.

    @param to_screen    Optional callable used to report a failure message
    @returns            True on success (or when no directory part exists), False on OSError
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Bug fix: this was `callable(to_screen) is not None`, which is always
        # truthy (a bool is never None), so `None(...)` was called and raised
        # TypeError instead of returning False
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
5159
5160
def get_executable_path():
    """Return the directory containing the running yt-dlp executable/script."""
    from ..update import _get_variant_and_executable_path

    exe_path = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(exe_path))
5165
5166
def get_user_config_dirs(package_name):
    """Yield candidate per-user configuration directories for *package_name*."""
    # XDG base directory (e.g. ~/.config/package_name)
    xdg_base = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
    yield os.path.join(xdg_base, package_name)

    # Windows roaming appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # Classic dotted home directory (~/.package_name)
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')
5179
5180
def get_system_config_dirs(package_name):
    """Yield system-wide configuration directories for *package_name*."""
    # Only /etc/<package_name> is considered
    yield os.path.join('/etc', package_name)
5184
5185
def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the given datetime.timedelta keyword arguments (hours=, days=, ...)
    """
    offset = datetime.timedelta(**kwargs).total_seconds()
    return time.time() + offset
5191
5192
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JSON Web Signature (JWS) with the HS256 algorithm.

    The result is in JWS Compact Serialization.
    Implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
    and JWS https://www.rfc-editor.org/rfc/rfc7515.html

    NOTE: segments use standard (padded) base64, matching the original
    implementation, not the unpadded base64url the RFC prescribes.

    @param payload_data  JSON-serializable payload (the JWT claims set)
    @param key           shared HMAC secret as str
    @param headers       optional extra/overriding JOSE header fields
    @returns             the token as bytes

    Bug fix: the `headers` default was a mutable dict (`headers={}`); it is now
    None, which is backward-compatible and avoids the shared-default pitfall.
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    signature = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256).digest()
    return header_b64 + b'.' + payload_b64 + b'.' + base64.b64encode(signature)
5210
5211
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode the payload of a JWS compact token WITHOUT verifying its signature."""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # Re-add trailing '='s that may have been stripped; superfluous '='s are ignored
    return json.loads(base64.urlsafe_b64decode(payload_b64 + '==='))
5218
5219
# False on Windows until windows_enable_vt_mode() succeeds; None elsewhere
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Whether *stream* looks capable of rendering terminal (VT/ANSI) escape sequences."""
    if compat_os_name == 'nt':
        # Requires VT processing to have been enabled first
        enabled = bool(WINDOWS_VT_MODE)
    else:
        enabled = bool(os.getenv('TERM'))
    if not enabled:
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
5234
5235
def windows_enable_vt_mode():
    """Enable virtual-terminal (ANSI escape) processing on the Windows console.

    No-op on Windows versions older than 10.0.10586. On success, sets the
    module-global WINDOWS_VT_MODE and invalidates the
    supports_terminal_sequences() cache.
    Ref: https://bugs.python.org/issue30075 """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported lazily since these are only available/needed on Windows
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly, so redirected stdout/stderr do not interfere
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Add VT processing to the existing console mode flags
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    # Only reached when both console calls succeeded (exceptions propagate above)
    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    supports_terminal_sequences.cache_clear()
5266
5267
# Matches SGR-style escape sequences: ESC '[' ... 'm'
_terminal_sequences_re = re.compile('\x1b\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip ANSI/VT color and style escape sequences from *string*."""
    return _terminal_sequences_re.sub('', string)
5273
5274
def number_of_digits(number):
    """Length of the base-10 integer rendering of *number* (including any '-' sign).

    Uses '%d' deliberately, so float inputs are truncated to their integer part."""
    rendered = '%d' % number
    return len(rendered)
5277
5278
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy *values* with *delim*; when *from_dict* is given,
    each value is first treated as a traversal path into it."""
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(v) for v in values if v)
5283
5284
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        # No format reported a width; leave thumbnails untouched
        return thumbnails
    width_repl = str(max_dimensions[0])
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, width_repl, thumb['url'])},
            dict(zip(_keys, max_dimensions)), thumb)
        for thumb in thumbnails
    ]
5305
5306
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not mobj:
        # Missing or unparsable header
        return None, None, None
    return int(mobj.group(1)), int_or_none(mobj.group(2)), int_or_none(mobj.group(3))
5315
5316
def read_stdin(what):
    """Announce that *what* is being read interactively, then return sys.stdin."""
    eof_key = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof_key}) to end:\n')
    return sys.stdin
5321
5322
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """

    # BOM marks are given priority over declarations
    for bom, enc in BOMS:
        if data.startswith(bom):
            return enc, len(bom)

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data.replace(b'\0', b''))
    if mobj:
        return mobj.group(1).decode(), 0
    return None, 0
5339
5340
class Config:
    """A (possibly nested) set of command-line options, loaded from arguments,
    config files and/or stdin. Nested configs are created for each
    `--config-locations` entry discovered while parsing."""
    own_args = None     # the raw args this config was created from
    parsed_args = None  # set by load_configs()
    filename = None     # path of the file this config was read from, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        # _loaded_paths is shared with child configs to avoid recursive loading
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """One-time initialization; returns False if *filename* was already loaded."""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own_args and recursively load any `--config-locations` they reference."""
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # '-' means read options from stdin (at most once)
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against this config's directory
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read and shlex-split a config file; returns *default* if it does not exist."""
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # Bug fix: the message previously contained the literal text
            # "(unknown)" instead of interpolating the offending filename
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of *opts* with values of credential options replaced by 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create and attach a child config sharing this config's loaded-path set."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        # Deepest (most recently appended) configs first, own args last
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
5448
5449
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes"""
    # Holds the connected websocket protocol object once __enter__ has run
    pool = None

    def __init__(self, url, headers=None, connect=True):
        # A dedicated event loop lets synchronous callers drive the connection
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Ensure the connection is torn down at interpreter exit
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        # Blocking send over the wrapped connection
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        # Blocking receive over the wrapped connection
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # NOTE(review): the loop is closed *before* _cancel_all_tasks() runs
            # on it -- confirm this ordering is intended
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        # Run the coroutine *main* to completion on *loop*, mirroring
        # asyncio.run()'s shutdown of async generators and the default executor
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        # Cancel every outstanding task on *loop* and surface unhandled exceptions
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5519
5520
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in headers.items():
            merged[name.title()] = value
    return merged
5524
5525
def cached_method(f):
    """Memoize a method per-instance, keyed on its bound (non-self) arguments."""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        # Drop `self` from the key; the cache itself lives on the instance
        key = tuple(bound.arguments.values())[1:]

        instance_caches = vars(self).setdefault('_cached_method__cache', {})
        cache = instance_caches.setdefault(f.__name__, {})
        if key not in cache:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper
5541
5542
class classproperty:
    """property access for class methods with optional caching"""

    def __new__(cls, func=None, *args, **kwargs):
        # Bare-call usage `@classproperty(cache=True)`: return a decorator
        # that will re-enter with the actual function
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, instance, cls):
        if self._cache is None:
            return self.func(cls)
        if cls not in self._cache:
            # Cache per owning class, so subclasses get their own value
            self._cache[cls] = self.func(cls)
        return self._cache[cls]
5561
5562
class function_with_repr:
    """Callable wrapper whose repr() is either a custom string or the
    wrapped function's qualified name."""

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func, self.__repr = func, repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5575
5576
class Namespace(types.SimpleNamespace):
    """Immutable namespace; iterating yields the attribute values in insertion order"""

    def __iter__(self):
        yield from self.__dict__.values()

    @property
    def items_(self):
        # Trailing underscore avoids clashing with a potential 'items' attribute
        return self.__dict__.items()
5586
5587
# Known media file extensions, grouped by kind. The `common_*` groups are
# merged into `video`/`audio` below; tuple order is meaningful to consumers.
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# Fold the common groups into the full video/audio lists
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# Every extension considered downloadable media (excludes thumbnails/subtitles)
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5602
5603
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # `_error` uses NO_DEFAULT as a sentinel for "no error this attempt";
    # None here means "not yet iterated"
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        # @param _retries          maximum number of retries (falsy -> 0)
        # @param _error_callback   called as (error, attempt, retries, **kwargs) after each failed attempt
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # Retry while the previous attempt errored (or never ran) and attempts remain
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # Expose the sentinel as None to callers
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Reset to "no error" before handing control to the loop body
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self.error:
                # Body assigned an error; report it before (possibly) retrying
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            # Retries exhausted: delegate to `error` if given, else re-raise
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # Prefer the underlying cause's message for readability
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        # sleep_func may be a callable taking the retry number, or a plain number
        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5658
5659
def make_archive_id(ie, video_id):
    """Build a download-archive entry: '<lowercased ie key> <video_id>'."""
    if not isinstance(ie, str):
        ie = ie.ie_key()
    return f'{ie.lower()} {video_id}'
5663
5664
def truncate_string(s, left, right=0):
    """Shorten *s* to at most left+right characters, replacing the middle
    (or end, when right == 0) with '...'. None passes through unchanged."""
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    tail = s[-right:] if right else ''
    return f'{s[:left - 3]}...{tail}'
5670
5671
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Resolve *options* (with aliases and '-'-prefixed negations) into an
    ordered, de-duplicated list of the entries in alias_dict['all'].

    @param options     requested names; '-name' removes previous matches
    @param alias_dict  maps alias -> list of names; must contain 'all'
    @param use_regex   treat non-alias entries as case-insensitive regexes
    @param start       initial already-selected names
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for val in options:
        discard = val.startswith('-')
        if discard:
            val = val[1:]

        if val in alias_dict:
            # Discarding an alias negates each of its members (a member that is
            # itself negated gets un-negated), then recurses to expand them
            val = alias_dict[val] if not discard else [
                i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(val, alias_dict, start=requested)
            continue

        current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
                   else [val] if val in alias_dict['all'] else None)
        if current is None:
            raise ValueError(val)

        if discard:
            # Remove every earlier occurrence of each matched name
            for item in current:
                while item in requested:
                    requested.remove(item)
        else:
            requested.extend(current)

    return orderedSet(requested)
5700
5701
# TODO: Rewrite
class FormatSorter:
    """Sorts media formats according to user/extractor-supplied sort fields."""

    # Grammar of one sort token: optional '+' (reverse), a field name, and an
    # optional ':'/'~' separator followed by a limit value
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Legacy youtube-dl-compatible ordering
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration consumed by _get_field_setting()/_resolve_field_value()
    settings = {
        # 'ordered' fields rank their value against an explicit preference list
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        # 'order_free' is preferred when the user opts into free formats
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        # Internal/forced fields not shown in verbose output
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        # 'combined'/'multiple' fields derive their value from several others
        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }
5790
    def __init__(self, ydl, field_preference):
        """
        @param ydl               YoutubeDL instance (supplies params and logging)
        @param field_preference  extractor-provided sort order
        """
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)
5797
    def _get_field_setting(self, field, key):
        """Look up *key* for *field* in self.settings, lazily filling in defaults
        (and registering unknown fields with a deprecation warning)."""
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            # Defaults depend on the field's declared 'type'
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            # Cache the computed default for subsequent lookups
            propObj[key] = default
        return propObj[key]
5816
    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert a raw field/limit *value* according to the field's 'convert'
        setting; for 'order' fields, higher return values rank better."""
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            # Rank against the (possibly free-format-preferring) order list
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            # 'float_string': numeric strings become floats; anything else
            # permanently switches the field to string comparison
            if value.isnumeric():
                return float(value)
            else:
                self.settings[field]['convert'] = 'string'
                return value
5850
    def evaluate_params(self, params, sort_extractor):
        """Build self._order and per-field sort data from forced/priority
        defaults, user preferences, extractor preferences and the defaults
        (earlier entries win; duplicates are ignored)."""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Register one sort field; first occurrence wins
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # Assembly order: forced defaults, then (unless format_sort_force)
        # priority defaults, then user prefs, extractor prefs and the defaults
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                # Resolve aliases to their canonical field, warning if deprecated
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # Combined fields expand into their sub-fields, each with its own limit
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)
5906
    def print_verbose_info(self, write_debug):
        """Log the user/extractor-given sort orders and the resolved field list
        (visible fields only) via *write_debug*."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))
5919
    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Map one field *value* to a comparable tuple; larger tuples sort as
        better formats."""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            # Clamp extractor-given preferences at the configured maximum
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        # The leading element groups: missing < non-numeric < numeric; within
        # the numeric group, 'closest'/'reverse'/limit shape the comparison
        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))
5948
5949 def _calculate_field_preference(self, format, field):
5950 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
5951 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5952 if type == 'multiple':
5953 type = 'field' # Only 'field' is allowed in multiple for now
5954 actual_fields = self._get_field_setting(field, 'field')
5955
5956 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5957 else:
5958 value = get_value(field)
5959 return self._calculate_field_preference_from_value(format, field, type, value)
5960
    def calculate_preference(self, format):
        """Return the sort key tuple for *format*.

        Mutates the format dict in place first, filling in missing derived
        fields (protocol, ext, audio/video ext, bitrates, and a preference
        penalty for out-of-spec codec/container pairs), then evaluates every
        configured sort field in order.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        # Split the container ext into audio_ext/video_ext depending on which
        # codec (if any) the format carries
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates: tbr ~ vbr + abr, so derive whichever
        # one is absent from the other two (order matters: zero out the
        # codec-less side first so the subtractions below see it)
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5997
5998
5999 # XXX: Temporary
6000 class _YDLLogger:
6001 def __init__(self, ydl=None):
6002 self._ydl = ydl
6003
6004 def debug(self, message):
6005 if self._ydl:
6006 self._ydl.write_debug(message)
6007
6008 def info(self, message):
6009 if self._ydl:
6010 self._ydl.to_screen(message)
6011
6012 def warning(self, message, *, once=False):
6013 if self._ydl:
6014 self._ydl.report_warning(message, only_once=once)
6015
6016 def error(self, message, *, is_error=True):
6017 if self._ydl:
6018 self._ydl.report_error(message, is_error=is_error)
6019
6020 def stdout(self, message):
6021 if self._ydl:
6022 self._ydl.to_stdout(message)
6023
6024 def stderr(self, message):
6025 if self._ydl:
6026 self._ydl.to_stderr(message)