]> jfr.im git - yt-dlp.git/blob - yt_dlp/utils/_utils.py
Add option `--netrc-cmd` (#6682)
[yt-dlp.git] / yt_dlp / utils / _utils.py
1 import asyncio
2 import atexit
3 import base64
4 import binascii
5 import calendar
6 import codecs
7 import collections
8 import collections.abc
9 import contextlib
10 import datetime
11 import email.header
12 import email.utils
13 import errno
14 import gzip
15 import hashlib
16 import hmac
17 import html.entities
18 import html.parser
19 import http.client
20 import http.cookiejar
21 import inspect
22 import io
23 import itertools
24 import json
25 import locale
26 import math
27 import mimetypes
28 import netrc
29 import operator
30 import os
31 import platform
32 import random
33 import re
34 import shlex
35 import socket
36 import ssl
37 import struct
38 import subprocess
39 import sys
40 import tempfile
41 import time
42 import traceback
43 import types
44 import unicodedata
45 import urllib.error
46 import urllib.parse
47 import urllib.request
48 import xml.etree.ElementTree
49 import zlib
50
51 from . import traversal
52
53 from ..compat import functools # isort: split
54 from ..compat import (
55 compat_etree_fromstring,
56 compat_expanduser,
57 compat_HTMLParseError,
58 compat_os_name,
59 compat_shlex_quote,
60 )
61 from ..dependencies import brotli, certifi, websockets, xattr
62 from ..socks import ProxyType, sockssocket
63
# Pretend to be the parent module so that strings built from __name__
# (e.g. the deprecation message in process_communicate_or_kill below)
# show the public `yt_dlp.utils` path rather than the private `_utils` one
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# The type of a compiled regular expression; the stdlib does not expose
# this under a clearly defined public name
compiled_regex_type = type(re.compile(''))
68
69
def random_user_agent():
    """Return a Chrome-on-Windows User-Agent string with a randomly picked Chrome version."""
    template = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    chrome_versions = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return template % random.choice(chrome_versions)
113
114
# Content-Encodings the HTTP layer can decode; 'br' only when the optional
# brotli dependency is importable
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default headers for outgoing HTTP requests (browser-like fingerprint)
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


# Fixed User-Agent strings selectable by name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
132
133
class NO_DEFAULT:
    """Sentinel used as a default value to distinguish "no default supplied"
    from an explicit default of None (see e.g. xpath_element)."""
    pass
136
137
def IDENTITY(x):
    """Identity function: return the argument unchanged."""
    return x
140
141
# English month names, in calendar order (index + 1 == month number)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Localized month-name tables keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# UTC offsets (in hours) for common timezone abbreviations
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# Accented character -> ASCII transliteration map;
# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
172
# strptime patterns tried, in order, when parsing free-form date strings
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Extra patterns for locales that write the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# Extra patterns for locales that write the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the trailing argument list of "packed" (eval-obfuscated) JavaScript
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Captures the JSON body of <script type="application/ld+json"> blocks
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# A decimal number with an optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
243
244
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Verify the codec actually works; a broken locale raises here
        'TEST'.encode(encoding)
    except Exception:
        encoding = 'UTF-8'
    return encoding
259
260
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """
    tmp = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')
    try:
        with tmp:
            json.dump(obj, tmp, ensure_ascii=False)
        if sys.platform == 'win32':
            # os.rename over an existing target raises WindowsError /
            # FileExistsError on Windows, so drop the target first
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # Give the temp file normal (umask-honoring) permissions instead
            # of NamedTemporaryFile's restrictive 0o600
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tmp.name, 0o666 & ~mask)
        os.rename(tmp.name, fn)
    except Exception:
        # Best-effort cleanup of the orphaned temp file, then re-raise
        with contextlib.suppress(OSError):
            os.remove(tmp.name)
        raise
285
286
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    predicate = '[@%s]' % key if val is None else f"[@{key}='{val}']"
    return node.find(xpath + predicate)
292
# Historical note: on Python 2.6 the xml.etree.ElementTree.Element methods
# did not support the namespace parameter, hence the manual expansion below
295
296
def xpath_with_ns(path, ns_map):
    """Expand ``prefix:tag`` steps of *path* into ``{uri}tag`` form using *ns_map*."""
    def expand(step):
        if ':' not in step:
            return step
        ns, tag = step.split(':')
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join(expand(step) for step in path.split('/'))
307
308
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching *xpath* (a string or iterable of strings).

    Returns *default* when given and nothing matches; otherwise raises
    ExtractorError if *fatal*, else returns None.
    """
    if isinstance(xpath, str):
        n = node.find(xpath)
    else:
        for xp in xpath:
            n = node.find(xp)
            if n is not None:
                break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element %s' % (name if name is not None else xpath))
    return None
330
331
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but return the matched element's text."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element\'s text %s' % (name if name is not None else xpath))
    return None
345
346
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return the value of attribute *key* on the element matching xpath[@key]."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (f'{xpath}[@{key}]' if name is None else name))
    return None
358
359
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)
363
364
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
368
369
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
374
375
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    return next(iter(get_elements_html_by_class(class_name, html)), None)
380
381
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute in the passed HTML document"""
    matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    return next(iter(matches), None)
385
386
def get_element_html_by_attribute(attribute, value, html, **kwargs):
    """Return the html of the first tag with the specified attribute in the passed HTML document"""
    matches = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    if not matches:
        return None
    return matches[0]
390
391
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # NOTE(review): **kargs is accepted for signature compatibility but not forwarded
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
397
398
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
404
405
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return list(map(operator.itemgetter(0), get_elements_text_and_html_by_attribute(*args, **kwargs)))
409
410
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return list(map(operator.itemgetter(1), get_elements_text_and_html_by_attribute(*args, **kwargs)))
414
415
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Yield (text, whole_element_html) pairs for every tag in *html* carrying
    attribute=value.  *tag* is a regex restricting which tag names match;
    *escape_value* treats *value* as a literal rather than a regex.
    """
    if not value:
        return

    # Quotes around the attribute value are optional unless the value starts
    # with a character that may not appear unquoted in HTML
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches from '<tag' through attribute="value" (rest of the element is
    # recovered by get_element_text_and_html_by_tag from the match start)
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one optional pair of surrounding quotes, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
441
442
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Control-flow exception: signals the outermost tag has been closed
        pass

    def __init__(self):
        # Stack of currently-open tag names, outermost first
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until the matching opening tag, tolerating unclosed inner tags
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        # Empty stack means the outermost tag was just closed
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
483
484
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index wrapper raising *exc* instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed only the opening tag first so the tag stack starts with *tag*
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            # Feed up to each candidate closing tag; the parser raises
            # HTMLBreakOnClosingTagException only when the tag that closes
            # the outermost element is reached (nested same-name tags are
            # tracked by the tag stack)
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
519
520
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        super().__init__()

    def handle_starttag(self, tag, attrs):
        # Capture the first element's attributes, then abort parsing
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')
531
532
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        self.items = []
        self._level = 0  # nesting depth; only top-level <li> elements are collected

    def handle_starttag(self, tag, attrs):
        if self._level == 0 and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
548
549
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    except compat_HTMLParseError:
        # Raised by the parser as soon as the first start tag was seen
        pass
    return parser.attrs
569
570
def parse_list(webpage):
    """Parse a run of HTML <li> elements and return a list with each element's attributes"""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
578
579
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    if html is None:  # Convenience for sanitizing descriptions etc.
        return None

    for pattern, replacement in (
        (r'\s+', ' '),                                # collapse whitespace runs
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),         # <br> variants -> newline
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),   # paragraph boundary -> newline
        ('<.*?>', ''),                                # strip remaining tags
    ):
        html = re.sub(pattern, replacement, html)
    # Replace html entities
    return unescapeHTML(html).strip()
594
595
class LenientJSONDecoder(json.JSONDecoder):
    """json.JSONDecoder that can pre-transform the input, ignore trailing
    garbage and attempt to close unterminated objects/arrays.

    @param transform_source  callable applied to the string before decoding
    @param ignore_extra      ignore any data after the first JSON value
    @param close_objects     maximum number of unclosed objects/arrays to auto-close
    """
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # Each close can take up to two attempts (insert ',' first, then the bracket)
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Return *err*.doc with one closing step applied, or None if it cannot be fixed up."""
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                if attempt < self._close_attempts:
                    # Keep the failed document in a separate name so the
                    # final error below never operates on None
                    closed = self._close_object(e)
                    if closed is not None:
                        s = closed
                        continue
                # Clamp the window start: e.pos - 10 must not go negative,
                # or the slice wraps around and shows the wrong context
                raise type(e)(f'{e.msg} in {s[max(e.pos - 10, 0):e.pos + 10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
634
635
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # First attempt uses the name as given; the second uses a sanitized path
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Fall back to an unlocked open if locking is unavailable/fails
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # EACCES is a permission problem, not a bad-filename problem
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                # Sanitizing changed nothing, retrying would be pointless
                raise
673
674
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input mirrors parsedate_tz's None
        return None
    return email.utils.mktime_tz(parsed)
682
683
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Replacements are emitted with a leading NUL marker ('\0') so the
        # collapse/strip passes below can tell substitutes from literal chars
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Drop '?' and control characters entirely
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
737
738
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows.

    On other platforms the path is returned unchanged unless *force* is set,
    in which case the Windows rules are applied anyway (without a drive/UNC part).
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters forbidden in Windows path components, and a
    # trailing space/dot, with '#'; keep '.'/'..' traversal parts intact
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Preserve absoluteness when forced on POSIX paths
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
760
761
def sanitize_url(url, *, scheme='http'):
    """Prepend a scheme to protocol-relative URLs and fix common scheme typos.

    None passes through unchanged.
    """
    if url is None:
        return None
    if url.startswith('//'):
        # Prepend protocol-less URLs with `http:` scheme in order to mitigate
        # the number of unwanted failures due to missing protocol
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    for typo, correction in (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    ):
        if re.match(typo, url):
            return re.sub(typo, correction, url)
    return url
780
781
def extract_basic_auth(url):
    """Strip userinfo from *url*; return (clean_url, 'Basic ...' header value or None)."""
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild netloc without the user:password@ prefix
    netloc = parts.hostname if parts.port is None else '%s:%d' % (parts.hostname, parts.port)
    clean_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = '%s:%s' % (parts.username, parts.password or '')
    token = base64.b64encode(credentials.encode()).decode()
    return clean_url, f'Basic {token}'
792
793
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request from *url* after sanitizing/escaping it and
    moving any userinfo credentials into an Authorization header."""
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # headers is the second positional argument of urllib.request.Request
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
800
801
def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))
805
806
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable, preserving first-seen order."""
    def generate():
        seen = []  # a list, not a set: the items may be unhashable
        for item in iterable:
            if item not in seen:
                seen.append(item)
                yield item

    return generate() if lazy else list(generate())
817
818
819 def _htmlentity_transform(entity_with_semicolon):
820 """Transforms an HTML entity to a character."""
821 entity = entity_with_semicolon[:-1]
822
823 # Known non-numeric HTML entity
824 if entity in html.entities.name2codepoint:
825 return chr(html.entities.name2codepoint[entity])
826
827 # TODO: HTML5 allows entities without a semicolon.
828 # E.g. '&Eacuteric' should be decoded as 'Éric'.
829 if entity_with_semicolon in html.entities.html5:
830 return html.entities.html5[entity_with_semicolon]
831
832 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
833 if mobj is not None:
834 numstr = mobj.group(1)
835 if numstr.startswith('x'):
836 base = 16
837 numstr = '0%s' % numstr
838 else:
839 base = 10
840 # See https://github.com/ytdl-org/youtube-dl/issues/7518
841 with contextlib.suppress(ValueError):
842 return chr(int(numstr, base))
843
844 # Unknown entity in name, return its literal representation
845 return '&%s;' % entity
846
847
def unescapeHTML(s):
    """Replace HTML entities in *s* with their characters; None passes through."""
    if s is None:
        return None
    assert isinstance(s, str)

    def replace_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^&;]+;)', replace_entity, s)
855
856
def escapeHTML(text):
    """Escape &, <, >, double and single quotes for safe embedding in HTML."""
    return text.translate(str.maketrans({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    }))
866
867
class netrc_from_content(netrc.netrc):
    """A netrc.netrc that parses its entries from an in-memory string."""

    def __init__(self, content):
        # Deliberately skip netrc.__init__, which insists on reading a file
        self.hosts = {}
        self.macros = {}
        with io.StringIO(content) as stream:
            self._parse('-', stream, False)
873
874
def process_communicate_or_kill(p, *args, **kwargs):
    """Deprecated shim for Popen.communicate_or_kill; warns and delegates."""
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)
879
880
class Popen(subprocess.Popen):
    """subprocess.Popen with console-window suppression on Windows,
    PyInstaller environment fix-ups, UTF-8 text-mode defaults and
    kill-on-failure communicate helpers."""

    if sys.platform == 'win32':
        # Prevent a console window from flashing up for spawned processes
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            # Not running from a PyInstaller bundle
            return

        def _fix(key):
            # PyInstaller saves the original value under <key>_ORIG
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether output will be text so run() can pick '' vs b''
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        # Like communicate(), but kill the child if communication fails/aborts
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            # timeout=None waits indefinitely for the child to die
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run the command to completion; return (stdout, stderr, returncode)."""
        with cls(*args, **kwargs) as proc:
            # proc.__text_mode resolves via name mangling to _Popen__text_mode
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
937
938
def encodeArgument(s):
    """Return *s* as str, decoding legacy ASCII byte strings if necessary."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
944
945
# Simple duration record: hours/minutes/seconds/milliseconds
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into an (hours, minutes, seconds, milliseconds) tuple."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
954
955
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H<delim>]MM<delim>SS, optionally with .mmm."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        formatted = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        formatted = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        formatted = '%d' % t.seconds
    if msec:
        return '%s.%03d' % (formatted, t.milliseconds)
    return formatted
965
966
def _ssl_load_windows_store_certs(ssl_context, storename):
    """Load trusted x509 certificates from the given Windows cert store into *ssl_context*."""
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        # Store not readable by this process; silently skip it
        return
    for cert in certs:
        # Individual bad certificates must not abort loading the rest
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
978
979
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose ssl.SSLContext reflects the
    certificate-checking, legacy-connect and client-certificate options in *params*."""
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer certifi's CA bundle when available, unless disabled via compat_opts
        if certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1041
1042
def bug_reports_message(before=';'):
    """Return the standard "please report this" blurb to be appended after
    the text `before`; capitalized when it starts a new sentence."""
    from ..update import REPOSITORY

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    before = before.rstrip()
    starts_sentence = not before or before.endswith(('.', '!', '?'))
    if starts_sentence:
        msg = msg[0].title() + msg[1:]

    if not before:
        return msg
    return f'{before} {msg}'
1054
1055
class YoutubeDLError(Exception):
    """Base exception for all errors raised by yt-dlp."""
    # Default message; subclasses may override this at class level.
    msg = None

    def __init__(self, msg=None):
        # An explicit message wins; otherwise fall back to the class-level
        # default, and finally to the exception's own class name.
        if msg is None:
            msg = self.msg if self.msg is not None else type(self).__name__
        self.msg = msg
        super().__init__(msg)
1066
1067
# Exceptions that indicate a (usually transient) network-level failure.
network_exceptions = (urllib.error.URLError, http.client.HTTPException, socket.error)
# ssl.CertificateError exists on all supported Pythons, but keep the
# feature test for stripped-down builds.
if hasattr(ssl, 'CertificateError'):
    network_exceptions += (ssl.CertificateError,)
1072
1073
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        # Errors raised while a network exception is being handled are
        # treated as expected: network failures are not yt-dlp bugs.
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # Unwrap nested ExtractorErrors so exc_info points at the root cause.
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Name-mangled to _ExtractorError__msg. Composes the final message
        # from ie/video_id/cause, appending the bug-report blurb for
        # unexpected errors.
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        # Human-readable rendering of the stored traceback and cause chain.
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Keep msg/args in sync when a contributing attribute is changed
        # after construction (e.g. an extractor fills in video_id later).
        # The guard on self.msg skips rebuilds during __init__.
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
1115
1116
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        # expected=True: an unsupported site is not a bug in yt-dlp.
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
1122
1123
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression fails to match."""
1127
1128
class GeoRestrictedError(ExtractorError):
    """Raised when a video cannot be accessed from the user's geographic
    location because the site enforces region restrictions."""

    def __init__(self, msg, countries=None, **kwargs):
        # Geo-blocks are site-side policy, never a yt-dlp bug.
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries
1140
1141
class UserNotLive(ExtractorError):
    """Raised when the requested channel/user has no ongoing live stream."""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)
1148
1149
class DownloadError(YoutubeDLError):
    """Raised by FileDownloader objects that are not configured to continue
    on errors; carries the user-facing error message."""

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
1162
1163
class EntryNotInPlaylist(YoutubeDLError):
    """Raised by YoutubeDL when a requested entry cannot be found in the
    playlist's info_dict."""
    msg = 'Entry not found in info'
1171
1172
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Bug fix: the f-string previously contained the literal text
            # '(unknown)' instead of interpolating `filename`, so the
            # offending file name never appeared in the error message
            # (compare UnavailableVideoError, which appends f': {err}').
            self.msg += f': {filename}'
        super().__init__(self.msg)
1185
1186
class PostProcessingError(YoutubeDLError):
    """Raised from a PostProcessor's .run() method to signal that the
    post-processing task failed."""
1193
1194
class DownloadCancelled(YoutubeDLError):
    """Base class for exceptions that interrupt the download queue."""
    msg = 'The download was cancelled'
1198
1199
class ExistingVideoReached(DownloadCancelled):
    """Raised when --break-on-existing hits a video already in the archive."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1203
1204
class RejectedVideoReached(DownloadCancelled):
    """Raised when --break-match-filter hits a video that fails the filter."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1208
1209
class MaxDownloadsReached(DownloadCancelled):
    """Raised once the --max-downloads limit has been hit."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1213
1214
class ReExtractInfo(YoutubeDLError):
    """Signals that the video's info must be extracted again."""

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected
1221
1222
class ThrottledDownload(ReExtractInfo):
    """Raised when the measured speed falls below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)
1229
1230
class UnavailableVideoError(YoutubeDLError):
    """Raised when the video is requested in a format that the site does not
    offer for it."""
    msg = 'Unable to download video'

    def __init__(self, err=None):
        # Append the underlying error detail, when one was given.
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)
1243
1244
class ContentTooShortError(YoutubeDLError):
    """Raised when the downloaded payload is smaller than what the server
    announced, which usually means the connection was interrupted."""

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both counts are in bytes.
        self.downloaded = downloaded
        self.expected = expected
1258
1259
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended file attributes fails; classifies the
    failure into a machine-readable `reason`."""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Map the errno/message onto the coarse reason callers switch on.
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1274
1275
class XAttrUnavailableError(YoutubeDLError):
    """Raised when no usable xattr implementation is available."""
1278
1279
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate an http.client connection, honouring the handler's
    'source_address' option by binding outgoing sockets to that address
    (and filtering remote addresses to the matching IP family)."""
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Pick the address family matching the source address: the
            # presence of '.' is used to distinguish IPv4 from IPv6.
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate address in turn, keeping the last error.
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1325
1326
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        # `params` is the YoutubeDL option dict; it is kept by reference and
        # consulted per request (e.g. 'http_headers'), not copied.
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Swap in a SOCKS-capable connection class when the request carries
        # the internal 'Ytdl-socks-proxy' marker header.
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Try raw-deflate first; fall back to zlib-wrapped streams.
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        # `brotli` is an optional dependency; callers guard on its presence.
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            # Retry with progressively shorter payloads before giving up.
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Youtubedl-no-compression' in req.headers:  # deprecated
            req.headers.pop('Youtubedl-no-compression', None)
            req.add_header('Accept-encoding', 'identity')

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
1454
1455
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from `base_class` whose sockets are
    tunnelled through the SOCKS proxy described by the `socks_proxy` URL."""
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    parsed = urllib.parse.urlparse(socks_proxy)
    scheme = parsed.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): any other scheme leaves `socks_type` unbound and raises
    # NameError below; callers appear to pre-validate the scheme -- confirm.

    def unquote_if_non_empty(value):
        return urllib.parse.unquote_plus(value) if value else value

    proxy_args = (
        socks_type,
        parsed.hostname, parsed.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(parsed.username),
        unquote_if_non_empty(parsed.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            # Replace the plain socket with a SOCKS-aware one.
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # Wrap in TLS afterwards when the base class is HTTPS.
            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1497
1498
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler supporting SOCKS proxies, custom connection classes and
    yt-dlp's source-address handling (via _create_http_connection)."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        # Forward the handler's SSL context / hostname checking to the
        # connection, when the base handler was built with them.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Internal marker header set upstream to request SOCKS tunnelling.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            # Old/misconfigured servers may need --legacy-server-connect;
            # surface an actionable message instead of the raw SSL error.
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
1527
1528
def is_path_like(f):
    """True for anything usable as a filesystem path (str/bytes/PathLike)."""
    return isinstance(f, (str, bytes, os.PathLike))
1531
1532
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """HTTPCookieProcessor that also applies cookie handling to HTTPS traffic."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1542
1543
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up Request for a 3xx response, or raise HTTPError
        for redirect codes we refuse to follow automatically."""
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_method = req.get_method()
        new_data = req.data
        remove_headers = []
        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and req.get_method() != 'HEAD':
            new_method = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        elif code in (301, 302) and req.get_method() == 'POST':
            new_method = 'GET'

        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            # Bug fix: these entries must be lower-case to match the
            # `k.lower()` comparison below. The previous title-cased values
            # never matched, so stale Content-Length/Content-Type headers
            # were carried over to the bodiless GET request.
            remove_headers.extend(['content-length', 'content-type'])

        new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
1587
1588
def extract_timezone(date_str):
    """Split a trailing timezone designator off `date_str`.

    Returns (timezone, date_str) where `timezone` is a datetime.timedelta and
    `date_str` has the recognized designator removed. NOTE(review): when an
    alphabetic zone is present but unknown to TIMEZONE_NAMES, `timezone` is
    returned as None -- callers must tolerate that; confirm intent.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # No numeric offset: try a trailing alphabetic zone name (e.g. 'EST')
        # looked up in the TIMEZONE_NAMES table.
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
            timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # A bare 'Z' (no sign group) means UTC.
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1617
1618
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # strptime cannot digest fractional seconds here; drop them entirely.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    # Derive the UTC offset from the string unless one was supplied.
    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        dt = datetime.datetime.strptime(
            date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
        return calendar.timegm(dt.timetuple())
1634
1635
def date_formats(day_first=True):
    """Candidate strptime formats, day-first or month-first preference."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1638
1639
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    # NOTE: the loop deliberately has no break -- when several formats
    # match, the last matching one wins.
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822 style parsing (e.g. 'Thu 01 Jan 1970 00:00:00').
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)
1662
1663
def unified_timestamp(date_str, day_first=True):
    """Parse a free-form date string into a UNIX timestamp, or None."""
    if date_str is None:
        return None

    # Collapse separators and drop weekday names, which the strptime
    # format candidates do not include.
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # Record a PM marker (applied as +12h) before it is stripped below.
    # NOTE(review): '12:xx PM' inputs also get +12h here -- confirm against
    # the formats callers feed in.
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to RFC 2822 parsing; reapply PM shift and timezone manually.
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1695
1696
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to `default_ext`."""
    if url is None or '.' not in url:
        return default_ext
    # Strip the query string, then take whatever follows the final dot.
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Handle URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1708
1709
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Derive a subtitle file name by swapping in '<lang>.<format>' as extension."""
    return replace_extension(filename, f'{sub_lang}.{sub_format}', expected_real_ext)
1712
1713
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        # Defer rounding until the offset unit (if any) is known.
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # Recursively resolve the base date, then apply the signed offset.
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # timedelta cannot express months/years; use calendar arithmetic.
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # 'auto' rounds to the unit used in the offset expression.
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1754
1755
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1766
1767
def datetime_add_months(dt, months):
    """Shift `dt` by `months` (possibly negative), clamping the day to the
    target month's length (e.g. Jan 31 + 1 month -> Feb 28/29)."""
    total_months = dt.month - 1 + months
    year = dt.year + total_months // 12
    month = total_months % 12 + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
1775
1776
def datetime_round(dt, precision='day'):
    """Round `dt` (treated as UTC) to the nearest `precision` unit.

    'microsecond' is a no-op; otherwise rounds half-up on the UNIX
    timestamp. Raises KeyError for an unknown precision.
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    timestamp = calendar.timegm(dt.timetuple())
    rounded = ((timestamp + seconds_per_unit / 2) // seconds_per_unit) * seconds_per_unit
    return datetime.datetime.utcfromtimestamp(rounded)
1793
1794
def hyphenate_date(date_str):
    """Convert a 'YYYYMMDD' string to 'YYYY-MM-DD'; other strings pass through."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
1803
1804
class DateRange:
    """A closed interval of calendar dates."""

    def __init__(self, start=None, end=None):
        """start and end are strings in any format accepted by date_from_str;
        an omitted bound extends to the minimum/maximum representable date."""
        self.start = date_from_str(start, strict=True) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end, strict=True) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Return a range containing only the given day."""
        return cls(day, day)

    def __contains__(self, date):
        """True if `date` (a date object or parseable string) lies within the range."""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1838
1839
@functools.cache
def system_identifier():
    """One-line description of the interpreter, machine, OS and OpenSSL
    version, used verbatim in debug output/bug reports (computed once)."""
    python_implementation = platform.python_implementation()
    if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        python_implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1858
1859
@functools.cache
def get_windows_version():
    """Return the Windows version tuple, or () when not running on Windows."""
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1867
1868
def write_string(s, out=None, encoding=None):
    """Write the text `s` to stream `out` (default sys.stderr), encoding
    manually when the stream is binary or exposes an underlying buffer."""
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # NOTE(review): inserts a space before every line-break run on
        # Windows terminals; the reason is not evident from this file.
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: encode ourselves with the preferred encoding.
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a binary buffer: write bytes to the buffer.
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1888
1889
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Emit a deprecation message exactly once per distinct *msg* when in CLI mode;
    otherwise raise a standard DeprecationWarning via the warnings machinery."""
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return
    if msg in deprecation_warning._cache:
        return
    deprecation_warning._cache.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Set of messages already shown, so CLI users see each deprecation only once
deprecation_warning._cache = set()
1905
1906
def bytes_to_intlist(bs):
    """Convert a bytes-like object (or a legacy str) into a list of int values."""
    if not bs:
        return []
    # bytes/bytearray already iterate as ints; a str needs ord()
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
1914
1915
def intlist_to_bytes(xs):
    """Pack a list of integers (0-255) back into a bytes object."""
    if not xs:
        return b''
    return struct.pack('%dB' % len(xs), *xs)
1920
1921
class LockingUnsupportedError(OSError):
    """Raised when no file-locking primitive is available on this platform."""

    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1927
1928
# Cross-platform file locking.
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) using the native
# primitive for the platform: LockFileEx/UnlockFileEx on Windows, flock/lockf
# elsewhere, with LockingUnsupportedError fallbacks when neither is available.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    # Mirrors the Win32 OVERLAPPED structure, required by LockFileEx/UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the largest possible byte range so the whole file is covered
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the whole of file *f* (exclusively or shared), optionally non-blocking."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep a reference on the file object so the struct outlives the lock
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock previously taken by _lock_file on *f*."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock file *f* via flock(), falling back to lockf() where flock is missing."""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # Non-blocking lock contention must propagate unchanged
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock on *f*, trying flock(), lockf(), then flock() with LOCK_NB."""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            """Stub used when no locking primitive exists; always raises."""
            raise LockingUnsupportedError()

        def _unlock_file(f):
            """Stub used when no locking primitive exists; always raises."""
            raise LockingUnsupportedError()
2015
2016
class locked_file:
    """A file wrapper that holds an OS-level lock while the file is open.

    Only plain 'r', 'rb', 'a', 'ab', 'w' and 'wb' modes are supported.
    Reading takes a shared lock, any writing mode an exclusive one.
    Usable as a context manager; unknown attributes are proxied to the
    underlying file object.
    """

    locked = False  # True while the OS lock is held

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        # Open via os.open so the exact flag set can be controlled; truncation
        # is deliberately deferred until after the lock is acquired
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        """Acquire the lock (closing the file on failure), then truncate for 'w' modes."""
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        """Release the lock if held; always clears the locked flag."""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        """Unlock and close the file, even if unlocking raises."""
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the wrapper can be used without the `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate everything else (read, write, seek, ...) to the real file
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2080
2081
@functools.cache
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to utf-8 when undetectable."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
2086
2087
def shell_quote(args):
    """Join *args* into a single shell-safe command-line string."""
    encoding = get_filesystem_encoding()

    def to_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(compat_shlex_quote(to_text(a)) for a in args)
2097
2098
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{payload}'
2107
2108
def unsmuggle_url(smug_url, default=None):
    """Extract data hidden in a URL by smuggle_url(); returns (url, data)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
2116
2117
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format a number with decimal suffixes (k, M, G, ...; Ki, Mi, ... for factor 1024)."""
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    SUFFIXES = 'kMGTPEZY'
    exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(SUFFIXES))
    suffix = ['', *SUFFIXES][exponent]
    if factor == 1024:
        # Binary prefixes: 'k' becomes 'Ki', others get an 'i' appended
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    return fmt % (num / (factor ** exponent), suffix)
2130
2131
def format_bytes(bytes):
    """Render a byte count using binary suffixes, or 'N/A' when unknown."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2134
2135
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number><unit>' at the start of *s* and return the scaled, rounded int.

    In non-strict mode a comma is accepted as decimal separator and only the
    prefix of *s* needs to match; strict mode requires a full match.
    """
    num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(map(re.escape, unit_table))
    matcher = re.fullmatch if strict else re.match
    m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None
    value = float(m.group('num').replace(',', '.'))
    return round(value * unit_table[m.group('unit')])
2147
2148
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    table = {unit: 1024 ** exponent for exponent, unit in enumerate(['', *'KMGTPEZY'])}
    return lookup_unit_table(table, s.upper(), strict=True)
2154
2155
def parse_filesize(s):
    """Parse a human-readable file size ('5 MiB', '1.2GB', ...) into bytes, or None."""
    if s is None:
        return None

    # Build the unit table programmatically. For each SI prefix we accept the
    # binary ('KiB') and decimal ('KB') forms plus the common informal
    # lower-case variants ('kB' -> binary, 'kb' -> decimal), which are of
    # course incorrect and unofficial but seen in the wild, and the spelled-out
    # names. NB: insertion order is significant because lookup_unit_table
    # builds a regex alternation from the keys.
    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    prefixes = [
        ('K', 'kilobytes', 'kibibytes'),
        ('M', 'megabytes', 'mebibytes'),
        ('G', 'gigabytes', 'gibibytes'),
        ('T', 'terabytes', 'tebibytes'),
        ('P', 'petabytes', 'pebibytes'),
        ('E', 'exabytes', 'exbibytes'),
        ('Z', 'zettabytes', 'zebibytes'),
        ('Y', 'yottabytes', 'yobibytes'),
    ]
    for exponent, (letter, decimal_name, binary_name) in enumerate(prefixes, start=1):
        decimal, binary = 1000 ** exponent, 1024 ** exponent
        lower = letter.lower()
        unit_table.update({
            f'{letter}iB': binary,
            f'{letter}B': decimal,
            f'{lower}B': binary,
            f'{letter}b': decimal,
            f'{lower}b': decimal,
            decimal_name: decimal,
            binary_name: binary,
        })

    return lookup_unit_table(unit_table, s)
2225
2226
def parse_count(s):
    """Parse a human-readable count ('1.2M', '15k', '3,456 views') into an int, or None."""
    if s is None:
        return None

    # Drop a leading non-numeric word (e.g. 'Views 1,234')
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    result = lookup_unit_table(multipliers, s)
    if result is not None:
        return result

    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(mobj.group(1)) if mobj else None
2254
2255
def parse_resolution(s, *, lenient=False):
    """Extract width/height from a resolution-like string ('1920x1080', '720p', '4k').

    @param lenient  also match WxH even when embedded in alphanumeric text
    @returns        dict with 'width'/'height' keys as applicable, possibly empty
    """
    if s is None:
        return {}

    if lenient:
        pattern = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    else:
        pattern = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    mobj = re.search(pattern, s)
    if mobj:
        return {'width': int(mobj.group('w')), 'height': int(mobj.group('h'))}

    # '720p' / '1080i' style
    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    # '4k' / '8k' marketing shorthand (4k == 2160 lines)
    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}
2279
2280
def parse_bitrate(s):
    """Extract an 'NNN kbps' bitrate from *s* as an int, or None."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
2287
2288
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in names:
        return None
    return names.index(name) + 1
2298
2299
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    if abbrev not in abbreviations:
        return None
    return abbreviations.index(abbrev) + 1
2308
2309
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in *xml_str* with '&amp;', leaving existing entities intact."""
    # Negative lookahead skips named entities and numeric character references
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
2316
2317
def setproctitle(title):
    """Set the process name shown by ps/top via prctl(PR_SET_NAME) - glibc only.

    Best effort: silently returns when ctypes is unavailable, libc.so.6
    cannot be loaded (non-glibc platforms), or libc lacks prctl().
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    # NB: the former `except TypeError` fallback guarded a Python 2.7
    # LoadLibrary quirk and is unreachable on Python 3, so it was dropped
    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
2343
2344
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present; None passes through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2347
2348
def remove_end(s, end):
    """Strip *end* from the end of *s* when present; None passes through."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
2351
2352
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes from *s*."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
2360
2361
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    if netloc.startswith('www.'):
        netloc = netloc[4:]
    return netloc or None
2368
2369
def url_basename(url):
    """Return the last path component of *url* ('' when there is none)."""
    path = urllib.parse.urlparse(url).path
    return path.rstrip('/').rpartition('/')[2]
2373
2374
def base_url(url):
    """Return the URL up to and including the last '/' before any query/fragment."""
    m = re.match(r'https?://[^?#]+/', url)
    return m.group()
2377
2378
def urljoin(base, path):
    """Join *path* onto *base*, returning None when either part is unusable.

    Absolute/protocol-relative paths are returned untouched; *base* must be
    an http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not path or not isinstance(path, str):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path  # already absolute (or protocol-relative)
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
2392
2393
class HEADRequest(urllib.request.Request):
    """A Request that issues an HTTP HEAD instead of GET."""

    def get_method(self):
        return 'HEAD'
2397
2398
class PUTRequest(urllib.request.Request):
    """A Request that issues an HTTP PUT instead of GET/POST."""

    def get_method(self):
        return 'PUT'
2402
2403
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* to int scaled by invscale/scale, returning *default* on failure.

    When *get_attr* is given, the named attribute of *v* is converted instead.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        result = int(v)
    except (ValueError, TypeError, OverflowError):
        return default
    return result * invscale // scale
2411
2412
def str_or_none(v, default=None):
    """Stringify *v*, returning *default* when it is None."""
    if v is None:
        return default
    return str(v)
2415
2416
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Tolerate thousands separators and '+' signs
        int_str = re.sub(r'[,\.\+]', '', int_str)
    try:
        return int(int_str)
    except (ValueError, TypeError, OverflowError):
        return None
2424
2425
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale, returning *default* on failure."""
    if v is None:
        return default
    try:
        result = float(v)
    except (ValueError, TypeError):
        return default
    return result * invscale / scale
2433
2434
def bool_or_none(v, default=None):
    """Return *v* only when it is a genuine bool, else *default*."""
    if isinstance(v, bool):
        return v
    return default
2437
2438
def strip_or_none(v, default=None):
    """Return v.strip() for strings, else *default*."""
    if isinstance(v, str):
        return v.strip()
    return default
2441
2442
def url_or_none(url):
    """Return the stripped URL when it uses a recognized scheme (http(s), rtmp-family,
    rtsp, mms, ftp(s)) or is protocol-relative; otherwise None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
2448
2449
def request_to_url(req):
    """Return the URL string of *req*, which may be a Request object or a plain URL."""
    return req.get_full_url() if isinstance(req, urllib.request.Request) else req
2455
2456
def strftime_or_none(timestamp, date_format, default=None):
    """Format *timestamp* (unix time or 'YYYYMMDD' string) with *date_format*,
    returning *default* on any failure."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Use an aware epoch plus a timedelta: naive fromtimestamp()
            # breaks timestamp() on Windows and fails for negative values
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185,
            #      https://github.com/python/cpython/issues/94414,
            #      https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            dt = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                  + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            dt = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            dt = None
        # Emulate the POSIX-only %s directive so it also works on Windows
        date_format = re.sub(
            r'(?<!%)(%%)*%s', rf'\g<1>{int(dt.timestamp())}', date_format)
        return dt.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2474
2475
def parse_duration(s):
    """Parse a duration ('1:23:45', '2h30m', 'PT1M30S', '1.5 hours', ...) into
    seconds as a float, or None when *s* is not recognized."""
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    # 1) Clock format: [[DD:]HH:]MM:SS[.ms]
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        parts = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2) Unit-suffixed / ISO-8601-like format ('1d 2h', 'PT5M', '90s', ...)
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            parts = m.groups()
        else:
            # 3) Fractional '1.5 hours' / '90 min' forms
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not m:
                return None
            parts = (None, *m.groups(), None, None)

    days, hours, mins, secs, ms = parts
    if ms:
        ms = ms.replace(':', '.')
    return sum(float(value or 0) * multiplier for value, multiplier in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2530
2531
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension ('a.mp4' -> 'a.<ext>.mp4').

    When *expected_real_ext* is given but does not match the actual
    extension, '(unknown).<ext>' is returned instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'(unknown).{ext}'
    return f'{name}.{ext}{real_ext}'
2538
2539
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of *filename* with *ext*.

    When *expected_real_ext* is given but does not match the current
    extension, *ext* is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return f'{name}.{ext}'
2545
2546
def check_executable(exe, args=()):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a sequence of arguments for a short output (like -version)

    NB: the default was previously a mutable list ([]); a tuple avoids the
    shared-mutable-default pitfall and remains backward-compatible since
    callers can still pass lists.
    """
    try:
        Popen.run([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
2555
2556
def _get_exe_version_output(exe, args):
    """Run *exe* with *args* and return its combined stdout/stderr text.

    Returns None when the process exits non-zero, and False when the
    binary cannot be executed at all.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else stdout
2569
2570
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's output using *version_re*,
    returning *unrecognized* when no version can be found."""
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
2580
2581
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        # Ran but exited non-zero -> "broken" (or the single fallback value)
        return unrecognized[-1]
    if not output:
        return output  # False: not installed at all
    return detect_exe_version(output, version_re, unrecognized[0])
2592
2593
def frange(start=0, stop=None, step=1):
    """Float range"""
    if stop is None:
        start, stop = 0, start
    direction = (step > 0) - (step < 0)  # 0 when step == 0 -> empty range
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2602
2603
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Distinguishes exhaustion of this list from unrelated IndexErrors
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache may be shared between a list and its __reversed__/__copy__ views
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        # Pull everything remaining from the iterable into the cache
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map a forward index to the equivalent index from the end (~x == -x - 1)
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                # Translate the slice so it can be applied to the forward cache
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Only pull as many items as the requested index/slice needs
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            # Probing the first (logical) element avoids full exhaustion
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # Shares the iterable and cache with the original view
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2691
2692
class PagedList:
    """Base class for lazily-fetched, page-oriented sequences of entries."""

    class IndexError(IndexError):
        """Raised when an index lies past the end of the list."""

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')  # unknown until a fetch fails
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return (and optionally cache) the list of entries on page *pagenum*."""
        results = self._cache.get(pagenum)
        if results is not None:
            return results
        results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        """Materialize the entries in [start, end) as a plain list."""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
2731
2732
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        """Yield entries in [start, end), fetching successive pages as needed."""
        for pagenum in itertools.count(start // self._pagesize):
            # Absolute indices of the first entry on this page and the next
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested window within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last page that worked so later lookups short-circuit
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2772
2773
class InAdvancePagedList(PagedList):
    """PagedList whose total number of pages is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        if end is None:
            last_page = self._pagecount
            remaining = None
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start
        # Entries on the first page that lie before *start*
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, last_page):
            page = self.getpage(pagenum)
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) < remaining:
                    remaining -= len(page)
                else:
                    # Final (possibly partial) page of the requested window
                    yield from page[:remaining]
                    break
            yield from page
2798
2799
class PlaylistEntries:
    """Iterable view over a playlist's entries, honoring the user's
    playlist_items / playliststart / playlistend selection."""

    MissingEntry = object()  # sentinel for gaps in a partially-extracted playlist
    is_exhausted = False  # True once the full length of the playlist is known

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Re-create the playlist at full length, filling unrequested
            # positions with the MissingEntry sentinel (indices are 1-based)
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches one segment of --playlist-items: START[:END[:STEP]] or a single index
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or slice for each comma-separated segment of *string*."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # 'inf' parses to float('inf') via float_or_none
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (index, entry) pairs selected by the user's playlist options."""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the playlist's total entry count when it can be determined."""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function fetching entry i, raising self.IndexError past the end
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Route lazy extraction through the ydl error-handling wrapper
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Accepts 1-based ints and slices; yields (1-based index, entry) pairs
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2934
2935
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX (32-bit) escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(r'\\U[0-9a-fA-F]{8}', lambda m: decode(m.group(0))[0], s)
2942
2943
def lowercase_escape(s):
    """Decode literal \\uXXXX (16-bit) escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(r'\\u[0-9a-fA-F]{4}', lambda m: decode(m.group(0))[0], s)
2950
2951
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Reserved/sub-delim characters stay untouched; only unsafe bytes get quoted
    safe = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe)
2955
2956
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parsed = urllib.parse.urlparse(url)
    escaped_fields = {
        field: escape_rfc3986(getattr(parsed, field))
        for field in ('path', 'params', 'query', 'fragment')
    }
    # The host is IDNA-encoded instead of percent-escaped
    return parsed._replace(
        netloc=parsed.netloc.encode('idna').decode('ascii'),
        **escaped_fields,
    ).geturl()
2967
2968
def parse_qs(url, **kwargs):
    """Parse the query string of *url* into a dict of lists of values."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2971
2972
def read_batch_urls(batch_fd):
    """Read a batch file object and return the list of URLs in it.

    Lines may be str or bytes (decoded as UTF-8 with replacement); UTF-8
    BOMs are stripped, blank lines and lines starting with '#', ';' or ']'
    are skipped, and a trailing ' #comment' is removed from each URL.
    The file object is closed before returning.
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit must be passed by keyword — positional form is
        # deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2990
2991
def urlencode_postdata(*args, **kargs):
    """URL-encode the given mapping/sequence and return ASCII bytes suitable
    for use as an HTTP POST body."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2994
2995
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url          str or parsed URL tuple
    @param query_update dict of query parameters to merge into the URL
    @returns str
    """
    if isinstance(url, str):
        # Fast path: nothing to change
        if not kwargs and not query_update:
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
3014
3015
def update_url_query(url, query):
    """Merge dict *query* into the query string of *url*; returns the new URL."""
    return update_url(url, query_update=query)
3018
3019
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone a urllib Request, optionally overriding its URL, body, headers
    and/or query string.

    The HTTP verb is preserved by picking the matching Request subclass
    (HEADRequest/PUTRequest are declared elsewhere in this module).
    NB: falsy *data* (e.g. b'') keeps the original request body.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # Carry the timeout over when the original request had one
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3038
3039
3040 def _multipart_encode_impl(data, boundary):
3041 content_type = 'multipart/form-data; boundary=%s' % boundary
3042
3043 out = b''
3044 for k, v in data.items():
3045 out += b'--' + boundary.encode('ascii') + b'\r\n'
3046 if isinstance(k, str):
3047 k = k.encode()
3048 if isinstance(v, str):
3049 v = v.encode()
3050 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3051 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3052 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3053 if boundary.encode('ascii') in content:
3054 raise ValueError('Boundary overlaps with data')
3055 out += content
3056
3057 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3058
3059 return out, content_type
3060
3061
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    boundary_was_given = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            # The random boundary collided with the payload — retry with a
            # fresh one; an explicitly supplied boundary is never replaced
            if boundary_was_given:
                raise
            boundary = None
3090
3091
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Return True when *x* is an instance of *allowed_types* but not of
    *blocked_types*; by default str, bytes and mappings count as scalars."""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
3096
3097
def variadic(x, allowed_types=NO_DEFAULT):
    """Ensure *x* is tuple-like: return it unchanged when it is an iterable
    container, else wrap it in a 1-tuple. The types given in *allowed_types*
    are treated as scalars (they are passed to is_iterable_like as
    blocked_types)."""
    if not isinstance(allowed_types, (tuple, type)):
        # Passing an arbitrary iterable of types is deprecated
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
3103
3104
def try_call(*funcs, expected_type=None, args=(), kwargs=None):
    """Call each callable in *funcs* with *args*/*kwargs* and return the first
    result that matches *expected_type* (any result if it is None).

    Common lookup/conversion errors (AttributeError, KeyError, TypeError,
    IndexError, ValueError, ZeroDivisionError) are swallowed; returns None
    when every call fails or no result matches.

    NB: defaults were changed from mutable [] / {} to immutable equivalents
    (same observable behavior; avoids the shared-mutable-default pitfall).
    """
    for f in funcs:
        try:
            val = f(*args, **(kwargs or {}))
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            pass
        else:
            if expected_type is None or isinstance(val, expected_type):
                return val
3114
3115
def try_get(src, getter, expected_type=None):
    """Apply one or more getter callables to *src* and return the first result
    matching *expected_type*; common lookup errors are swallowed (see try_call)."""
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3118
3119
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of *dct* keeping only items for which cndn(key, value)
    is truthy; by default, items whose value is None are dropped."""
    return {key: value for key, value in dct.items() if cndn(key, value)}
3122
3123
def merge_dicts(*dicts):
    """Merge dicts left-to-right. A key is taken from a later dict only when
    it is not yet present (and the value is not None), or when the value seen
    so far is an empty string and the new value is a (non-empty) string."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            is_new = value is not None and key not in merged
            fills_empty_str = isinstance(value, str) and merged.get(key) == ''
            if is_new or fills_empty_str:
                merged[key] = value
    return merged
3132
3133
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as str, decoding bytes-like input with *encoding*.
    NB: the default encoding is captured once, at import time."""
    return string if isinstance(string, str) else str(string, encoding, errors)
3136
3137
# US (MPAA) movie ratings mapped to the age limit yt-dlp reports for them
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines mapped to the age limit yt-dlp reports for them
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3155
3156
def parse_age_limit(s):
    """Parse an age limit from an int, an 'NN+' string, a US movie rating or
    a TV parental guideline; returns an int or None when unrecognised."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    mobj = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if mobj:
        return TV_PARENTAL_GUIDELINES['TV-' + mobj.group(1)]
    return None
3173
3174
def strip_jsonp(code):
    """Strip a JSONP wrapper — 'func(...data...);', optionally prefixed with
    'window.' and followed by '//' comments — returning just the data."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
3183
3184
def js_to_json(code, vars={}, *, strict=False):
    """Convert JavaScript object notation in *code* into parseable JSON text.

    @param vars    dict of identifier -> replacement value, substituted for
                   bare identifiers found in the input
    @param strict  when true, raise ValueError on unknown identifiers instead
                   of quoting them as strings
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Escapes JSON understands are kept; \xNN becomes \u00NN; an escaped
        # newline (JS line continuation) is dropped; anything else is unescaped
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Recursively evaluate a ${...} template-string interpolation
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            # Backtick strings get ${...} interpolation before re-quoting
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # A trailing ':' means the number is used as an object key
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                # The substitution is not itself JSON — emit it as a string
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # new Map([[k, v], ...]) -> plain JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # Best-effort rewrites of common JS constructor/call patterns
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
3263
3264
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
3273
3274
# Stages at which postprocessors may run, in execution order
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output filename templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Output template types mapped to the default filename suffix used for them
# (None = no fixed suffix)
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# NB: this is a template — {0} is the key pattern, {1} the conversion types
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])? # unused in python
        {1} # conversion type
    )
    '''


# Valid conversion-type characters for STR_FORMAT_RE_TMPL's {1} slot
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
3314
3315
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
3324
3325
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
3328
3329
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is older than *limit*. Missing or
    unparseable versions are judged per *assume_new* (new -> not outdated)."""
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
3337
3338
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # NOTE(review): imported lazily rather than at module level — presumably
    # to avoid a circular import; confirm before moving to the top of the file
    from ..update import is_non_updateable

    return not is_non_updateable()
3345
3346
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(compat_shlex_quote, args))
3350
3351
def error_to_str(err):
    """Format an exception as 'ClassName: message'."""
    cls_name = type(err).__name__
    return f'{cls_name}: {err}'
3354
3355
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (optionally with parameters, e.g. 'video/mp4; codecs=…')
    to a file extension.

    Falls back to *default* when given; otherwise returns the subtype itself
    with '+' converted to '.', or None for non-string input.
    """
    if not isinstance(mt, str):
        if default is not NO_DEFAULT:
            return default
        return None

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop MIME parameters, then isolate the subtype ('video/mp4' -> 'mp4')
    mimetype = mt.partition(';')[0].strip().lower()
    _, _, subtype = mimetype.rpartition('/')

    # Look up the full mimetype, then the bare subtype, then the last
    # '+'-separated component ('svg+xml' -> 'xml')
    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    elif default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
3444
3445
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare file extension or a URL/filename;
    returns None for empty input or an unknown type."""
    if not ext_or_url:
        return None
    # A bare extension is turned into a dummy filename for guess_type
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    return mimetypes.guess_type(filename)[0]
3452
3453
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string (e.g. 'avc1.64001f, mp4a.40.2') into
    vcodec/acodec/scodec and dynamic_range fields."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Strip leading zeroes from each dotted part before matching
        # (e.g. 'av01' -> 'av1')
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                continue
            vcodec = full_codec
            # Detect HDR variants from the codec parameters
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Exactly two unrecognised codecs: assume video + audio, in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3494
3495
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Pick a container extension able to hold the given video/audio codec
    and extension combinations, honoring *preferences* when possible; falls
    back to 'mkv' (or the last preference when mkv is not allowed)."""
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    # Multiple video or audio streams only fit in mkv
    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Normalise to the first codec's fourcc with zeroes removed ('av01' -> 'av1')
    sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    # Codec matching failed — fall back to extension-family compatibility
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3534
3535
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Guess a file extension for a response handle: first from the
    Content-Disposition filename, then from the x-amz-meta-name header,
    finally from the Content-Type."""
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    meta_name = headers.get('x-amz-meta-name')
    if meta_name:
        ext = meta_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'), default=default)
3554
3555
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI for *data* (bytes) with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3558
3559
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No viewer age limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3568
3569
# List of known byte-order-marks (BOM)
# NB: the UTF-32 entries must precede the UTF-16 ones, since the UTF-16 BOMs
# are prefixes of the UTF-32 ones
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3578
3579
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    encoding = 'utf-8'
    remainder = first_bytes
    for bom, bom_encoding in BOMS:
        # Strip (possibly repeated) BOMs, remembering the encoding they imply
        while remainder.startswith(bom):
            encoding, remainder = bom_encoding, remainder[len(bom):]

    return re.match(r'^\s*<', remainder.decode(encoding, 'replace'))
3589
3590
def determine_protocol(info_dict):
    """Infer the download protocol for *info_dict*: an explicit 'protocol'
    field wins, then well-known URL scheme prefixes (rtmp/mms/rtsp), then the
    file extension (m3u8/f4m), falling back to the URL scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        # Live HLS is tagged 'm3u8'; VOD defaults to the native HLS downloader
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    elif ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3611
3612
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: ignore terminal escape sequences and the alignment tab
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # When hide_empty is set, drop columns whose data cells are all empty
    # (zip(*data) truncates to the data's width, so extra header cells survive
    # via zip_longest's fillvalue)
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a separator line made of the delimiter under the header
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Pad at the tab so the trailing part ends up right-aligned
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3643
3644
def _match_one(filter_part, dct, incomplete):
    """Evaluate a single filter expression (e.g. 'duration > 60', 'title *= x'
    or '!is_live') against dict *dct*.

    @param incomplete  bool, or a container of keys that may legitimately be
                       missing from dct; conditions on missing keys then pass
    @raises ValueError for an unparseable filter part
    """
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    if isinstance(incomplete, bool):
        is_incomplete = lambda _: incomplete
    else:
        is_incomplete = lambda k: k in incomplete

    operator_rex = re.compile(r'''(?x)
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        # NOTE(review): 'intval' is not a group of this regex; since one of
        # quotedstrval/strval always matches (.+?), the m['intval'] fallback is
        # unreachable — looks like a youtube-dl leftover; confirm before removal
        comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
        if m['quote']:
            # Unescape the quote character inside quoted values
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, (int, float)):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            return is_incomplete(m['key']) or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if is_incomplete(m.group('key')) and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
3723
3724
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    # '&' separates conditions (all must hold); a literal '&' is escaped as '\&'
    return all(
        _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
        for filter_part in re.split(r'(?<!\\)&', filter_str))
3735
3736
def match_filter_func(filters, breaking_filters=None):
    """Compile --match-filter style strings into a match_filter callable.

    Returns None when there is nothing to filter. The returned callable takes
    (info_dict, incomplete) and returns None to accept, a skip-reason string
    to reject, or NO_DEFAULT to ask interactively (when '-' was among the
    filters). Matching *breaking_filters* raise RejectedVideoReached instead.
    """
    if not filters and not breaking_filters:
        return None
    # Breaking filters are themselves compiled with this function (recursively)
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        ret = breaking_filters(info_dict, incomplete)
        if ret is not None:
            raise RejectedVideoReached(ret)

        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            return NO_DEFAULT if interactive and not incomplete else None
        else:
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3759
3760
class download_range_func:
    """Callable selecting sections of a video to download, by chapter-title
    regexes and/or explicit (start, end) time ranges."""

    def __init__(self, chapters, ranges):
        self.chapters, self.ranges = chapters, ranges

    def __call__(self, info_dict, ydl):
        # With nothing configured, yield a single empty section (whole video)
        if not self.ranges and not self.chapters:
            yield {}

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for idx, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': idx}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3787
3788
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML time expression ('12.5', '12.5s' or 'H:MM:SS[.fff|:fff]')
    into seconds; returns None for empty or unrecognised input."""
    if not time_expr:
        return None

    mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes, seconds = mobj.groups()
        # A ':' before the fraction is treated like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
3800
3801
def srt_subtitles_timecode(seconds):
    """Format *seconds* as an SRT timecode (HH:MM:SS,mmm)."""
    hrs, mins, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msec)
3804
3805
def ass_subtitles_timecode(seconds):
    """Format *seconds* as an ASS timecode (H:MM:SS.cc — centiseconds)."""
    hrs, mins, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (hrs, mins, secs, msec / 10)
3809
3810
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Legacy TTML namespaces are rewritten to their modern equivalents before
    # parsing, so one xpath namespace map covers all inputs
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    # XMLParser target that converts one <p> element into SRT-style markup
    # (<b>/<i>/<u>/<font>), closing tags in reverse order of opening
    class TTMLPElementParser:
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already applied by an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; repeat while some parent style is declared
    # later in the document than a style referencing it
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style referenced by <body> or <div> becomes the default for all <p>
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3977
3978
def cli_option(params, command_option, param, separator=None):
    """Build a CLI argument list for *command_option* from ``params[param]``.

    Returns [] when the parameter is unset; otherwise either
    ``[command_option, value]`` or a single ``f'{command_option}{separator}{value}'``.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3984
3985
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a CLI argument list for a boolean parameter.

    ``params[param]`` must be True, False or None (unset). When set, the
    matching *true_value*/*false_value* string is emitted, either as a
    separate argument or joined with *separator*.
    """
    value = params.get(param)
    assert value in (True, False, None)
    if value is None:
        return []
    text = true_value if value else false_value
    if separator is None:
        return [command_option, str(text)]
    return [f'{command_option}{separator}{text}']
3990
3991
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return ``[command_option]`` when ``params[param]`` equals *expected_value*, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3994
3995
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Resolve configuration arguments from *argdict* using the first matching key.

    Each entry of *keys* may be a single key or a tuple of keys whose argument
    lists are concatenated. Returns *default* when nothing matches.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_list in keys:
        found = [argdict.get(key.lower()) for key in variadic(key_list)]
        found = [args for args in found if args is not None]
        if found:
            # Flatten: each match is itself a list of arguments
            return [arg for args in found for arg in args]
    return default
4014
4015
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve per-executable configuration args, with '<main_key>+<exe>' scoping.

    When no explicit key suffixes are given (so the bare root key is present),
    also fall back to ``(main_key, exe)`` and ``'default'``; otherwise compat
    list handling is disabled.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
4027
4028
class ISO639Utils:
    """Conversion between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter) language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Keys are ISO 639-1, values are ISO 639-2/T. A few deprecated 639-1 codes
    # (iw, in, ji) are kept as aliases of their modern replacements.
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so tags like 'en-US'
        # resolve via their primary subtag. Returns None when unknown.
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear reverse lookup; the first (canonical) match wins.
        # Implicitly returns None when the code is unknown.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
4234
class ISO3166Utils:
    """Lookup of full country names from ISO 3166-1 alpha-2 country codes."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
        # Case-insensitive; returns None for unknown codes.
        return cls._country_map.get(code.upper())
4497
class GeoUtils:
    """Generation of plausible per-country IPv4 addresses (used for geo-bypass)."""

    # Major IPv4 address blocks per country, keyed by ISO 3166-1 alpha-2 code
    # (plus the non-ISO 'AP'/'EU' regional codes), values in CIDR notation.
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address string inside the given two-letter
        country code's block or an explicit CIDR block; None for unknown codes."""
        if len(code_or_block) == 2:
            # Two characters: treat as a country code and look up its block
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        # Lowest address of the block as a 32-bit big-endian integer
        addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
        # Highest address: set all host bits below the prefix length
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return str(socket.inet_ntoa(
            struct.pack('!L', random.randint(addr_min, addr_max))))
4756
4757
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler honoring a per-request 'Ytdl-request-proxy' header override."""

    def __init__(self, proxies=None):
        # Install fallback handlers first; the parent __init__ then overrides
        # them for any scheme actually present in *proxies*
        for scheme in ('http', 'https'):
            def default_open(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)
            setattr(self, f'{scheme}_open', default_open)
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            proxy = override
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        scheme = urllib.parse.urlparse(proxy).scheme.lower()
        if scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # yt-dlp's http/https handlers wrap the socket with SOCKS themselves
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        return urllib.request.ProxyHandler.proxy_open(self, req, proxy, type)
4781
4782
4783 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4784 # released into Public Domain
4785 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4786
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n <= 0:
        # Matches the historical behavior: zero (and negatives) encode as one NUL byte
        encoded = b'\000'
    else:
        # Minimal big-endian representation without leading zeros
        encoded = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    if blocksize > 0 and len(encoded) % blocksize:
        encoded = b'\000' * (blocksize - len(encoded) % blocksize) + encoded
    return encoded
4815
4816
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Big-endian interpretation; the empty string yields 0, exactly like the
    # original word-by-word accumulation loop
    return int.from_bytes(s, 'big')
4832
4833
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The message is interpreted little-endian: reverse the bytes, then parse
    # the hex digest as a big-endian integer
    message = int(binascii.hexlify(data[::-1]), 16)
    return f'{pow(message, exponent, modulus):x}'
4849
4850
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # EME-PKCS1-v1_5 requires the padding string PS to consist of NONZERO
    # octets -- a zero byte would prematurely terminate the padding during
    # decryption. The previous randint(0, 254) could emit zeros (and could
    # never emit 0xff); use the correct 1..255 range.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4864
4865
4866 def _base_n_table(n, table):
4867 if not table and not n:
4868 raise ValueError('Either table or n must be specified')
4869 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4870
4871 if n and n != len(table):
4872 raise ValueError(f'base {n} exceeds table length {len(table)}')
4873 return table
4874
4875
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    # Digits were produced least-significant first
    return ''.join(reversed(digits))
4887
4888
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    digit_value = {char: index for index, char in enumerate(_base_n_table(n, table))}
    base = len(digit_value)
    value = 0
    for char in string:
        value = value * base + digit_value[char]
    return value
4896
4897
def decode_packed_codes(code):
    """De-obfuscate JavaScript packed with the common p.a.c.k.e.r scheme."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    symbols = symbols.split('|')

    # Map each base-n token back to its original symbol; empty entries keep
    # the token itself
    symbol_table = {}
    for index in range(int(count) - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)],
        obfuscated_code)
4914
4915
def caesar(s, alphabet, shift):
    """Rotate every character of *s* that occurs in *alphabet* by *shift* positions."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(ch):
        # str.find gives the first occurrence, -1 when absent (left unchanged)
        pos = alphabet.find(ch)
        return ch if pos < 0 else alphabet[(pos + shift) % size]

    return ''.join(map(rotate, s))
4923
4924
def rot47(s):
    """Apply the ROT47 cipher: rotate the 94 printable ASCII chars (33-126) by 47."""
    # chr(33)..chr(126) is exactly '!' through '~'
    return caesar(s, ''.join(map(chr, range(33, 127))), 47)
4927
4928
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list into a dict, stripping surrounding quotes."""
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)}
4936
4937
def urshift(val, n):
    """Unsigned (logical) right shift of a 32-bit value, like JavaScript's >>>."""
    if val >= 0:
        return val >> n
    # Reinterpret the negative value as its unsigned 32-bit equivalent first
    return (val + 0x100000000) >> n
4940
4941
def write_xattr(path, key, value):
    """Write the extended attribute *key* = *value* (bytes) onto the file at *path*.

    Tries, in order: NTFS Alternate Data Streams (Windows), the python
    xattr/pyxattr modules, and finally the setfattr/xattr executables.
    Raises XAttrMetadataError on failure, XAttrUnavailableError when no
    method is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        # The 'xattr' module exposes setxattr directly
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The external tools take the value as a command-line string argument
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4991
4992
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the parts (as strings)
    of a uniformly random date between 1950-01-01 and 1995-12-31."""
    epoch = datetime.date(1950, 1, 1)
    span_days = (datetime.date(1995, 12, 31) - epoch).days
    picked = epoch + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(picked.year),
        month_field: str(picked.month),
        day_field: str(picked.day),
    }
5003
5004
def find_available_port(interface=''):
    """Return a currently-free TCP port number on *interface*, or None on failure."""
    try:
        with socket.socket() as probe:
            # Binding port 0 lets the OS pick an ephemeral free port
            probe.bind((interface, 0))
            return probe.getsockname()[1]
    except OSError:
        return None
5012
5013
# Templates for internet shortcut files, which are plain text files.

# Windows '.url' InternetShortcut format
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS '.webloc' property-list format
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org '.desktop' Link entry format
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps the --write-link format name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5045
5046
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.

    # Only omit the port when it is the scheme's well-known default.
    # (Previously port 80 was dropped unconditionally, mangling URLs such as
    # 'https://host:80/...', where 80 is NOT the default.)
    if iri_parts.port is not None and iri_parts.port != {'http': 80, 'https': 443}.get(iri_parts.scheme):
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5089
5090
def to_high_limit_path(path):
    """On Windows, prefix *path* with '\\\\?\\' to lift the MAX_PATH limitation.

    The maximum allowed length for individual path segments may still be limited.
    Other platforms return the path unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path
    return '\\\\?\\' + os.path.abspath(path)
5097
5098
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Traverse *obj* by *field*, transform with *func* and render via *template*.

    Returns *default* when the traversed value is falsy (or, when *ignore*
    is given, when the value is among the ignored ones).
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val
    else:
        skip = val in variadic(ignore)
    if skip:
        return default
    return template % func(val)
5104
5105
def clean_podcast_url(url):
    """Strip known podcast analytics/tracking redirect prefixes from *url*."""
    tracker_prefix = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracker_prefix, '', url)
5121
5122
# Lowercase hexadecimal alphabet used for random hex-digit generation
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version-4-shaped UUID string (lowercase hex).

    NOTE: both 'x' and 'y' placeholders get a uniformly random hex digit,
    so the variant bits are not RFC 4122 compliant — kept as-is.
    """
    def _random_hex(_match):
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_hex, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5128
5129
def make_dir(path, to_screen=None):
    """Create the parent directory of *path* (like ``mkdir -p``).

    @param path       file path whose parent directory should exist
    @param to_screen  optional callable used to report a creation failure
    @returns          True on success (or if nothing needed creating), False on failure
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Fixed: was `if callable(to_screen) is not None:` — a bool is never
        # None, so the branch was always taken and `to_screen(...)` crashed
        # with TypeError when no callback was supplied
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
5140
5141
def get_executable_path():
    """Return the directory containing the running yt-dlp executable/script."""
    from ..update import _get_variant_and_executable_path

    _variant, exe_path = _get_variant_and_executable_path()
    return os.path.dirname(os.path.abspath(exe_path))
5146
5147
def get_user_config_dirs(package_name):
    """Yield per-user configuration directories for *package_name*, in lookup order."""
    # XDG style: $XDG_CONFIG_HOME/package_name (defaults to ~/.config/package_name)
    xdg_base = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
    yield os.path.join(xdg_base, package_name)

    # Windows style: %APPDATA%/package_name
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # Classic dotfile: ~/.package_name
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')
5160
5161
def get_system_config_dirs(package_name):
    """Yield system-wide configuration directories for *package_name*."""
    # Currently only the conventional /etc/package_name location
    yield os.path.join('/etc', package_name)
5165
5166
def time_seconds(**kwargs):
    """
    Return the current epoch time shifted by the timedelta built from **kwargs
    (callers pass a UTC offset, e.g. ``time_seconds(hours=9)`` for JST)
    """
    offset = datetime.timedelta(**kwargs).total_seconds()
    return time.time() + offset
5172
5173
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create an HS256-signed JWT in JWS Compact Serialization (as bytes).

    @param payload_data  JSON-serializable claims dict
    @param key           shared secret (str) for the HMAC-SHA256 signature
    @param headers       optional extra/overriding JOSE header fields

    NOTE: segments use standard base64 *with* padding rather than the
    unpadded base64url that RFC 7515 specifies; kept as-is since existing
    consumers may rely on the current output.
    """
    # `headers` defaults to None instead of a mutable `{}` default argument
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    # The signing input is the two encoded segments joined by '.'
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
    return token
5191
5192
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode a JWT's payload *without* verifying its signature."""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # Re-add padding that may have been stripped; superfluous '=' are ignored
    return json.loads(base64.urlsafe_b64decode(payload_b64 + '==='))
5199
5200
# Whether VT (ANSI) processing has been enabled on the Windows console;
# None on non-Windows platforms, where no explicit enabling is needed
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Best-effort check whether ANSI escape sequences can be written to *stream*."""
    on_windows = compat_os_name == 'nt'
    # Windows consoles only honour VT sequences once explicitly enabled
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
5215
5216
def windows_enable_vt_mode():
    """Ref: https://bugs.python.org/issue30075 """
    # Enables ANSI (VT) escape sequence processing on the Windows console,
    # then flips WINDOWS_VT_MODE and invalidates the cached terminal check.
    # VT support requires Windows 10 TH2 (build 10586) or newer
    if get_windows_version() < (10, 0, 10586):
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly so this works even when
    # stdout/stderr are redirected
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Preserve the existing console mode bits, only adding VT processing
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    # Only reached when SetConsoleMode succeeded (exceptions propagate above)
    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # supports_terminal_sequences() is memoized per stream; clear it so the
    # newly-enabled VT mode is picked up
    supports_terminal_sequences.cache_clear()
5247
5248
# Matches ANSI SGR (style/color) escape sequences, e.g. '\033[0;31m'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return *string* with ANSI SGR escape sequences stripped out."""
    return _terminal_sequences_re.sub('', string)
5254
5255
def number_of_digits(number):
    """Length of the decimal rendering of *number* (any '-' sign included)."""
    rendered = '%d' % number
    return len(rendered)
5258
5259
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy values with *delim*.

    When *from_dict* is given, each value is first treated as a traversal
    path into that dict.
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
5264
5265
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    dimensions = [tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats]
    max_dimensions = max(dimensions, default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails

    width_str = str(max_dimensions[0])
    scaled = []
    for thumbnail in thumbnails:
        scaled.append(merge_dicts(
            {'url': re.sub(url_width_re, width_str, thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
5286
5287
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if mobj is None:
        return None, None, None
    start, end, total = mobj.groups()
    return int(start), int_or_none(end), int_or_none(total)
5296
5297
def read_stdin(what):
    """Announce that *what* will be read from STDIN and return the stream."""
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    else:
        eof = 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5302
5303
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """

    # BOM marks are given priority over declarations
    for bom, enc in BOMS:
        if data.startswith(bom):
            return enc, len(bom)

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if declaration:
        return declaration.group(1).decode(), 0
    return None, 0
5320
5321
class Config:
    """A (possibly nested) set of command-line options loaded from files/stdin.

    Each config file may itself reference further config locations via
    `--config-locations`; those are loaded recursively as child `Config`s.
    """
    own_args = None      # args given directly to this config (not children)
    parsed_args = None   # set to own_args once parsed
    filename = None      # file this config was read from, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Load *args*/*filename* into this config; returns False for duplicates."""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own args and recursively load any referenced config locations."""
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            # Guard against config files including each other in a loop
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # '-' means read additional options from stdin (at most once)
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against this config's directory
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read a config file and split it into an argv-style list.

        Returns *default* if the file cannot be opened. NB: the shared
        mutable default is only ever returned, never mutated here.
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)  # skip any BOM before decoding
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # Fixed: the message contained a literal "(unknown)" instead of
            # interpolating the actual file name
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return *opts* with values of credential options replaced by 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            # `--opt value` form: blank out the following argument
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Load a child config, sharing the already-loaded-paths guard set."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """All args, child configs first (so later/own args take precedence)."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
5429
5430
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes

    Owns a private event loop and drives the async `websockets` connection
    synchronously via run_with_loop(). Not thread-safe: all calls are expected
    from the thread that created the instance.
    """
    # The connected websocket protocol object; set by __enter__
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Ensure the connection/loop is torn down at interpreter exit
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        # Synchronously send a message over the websocket
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        # Synchronously receive the next message from the websocket
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # NOTE(review): the loop is closed *before* _cancel_all_tasks runs
            # run_until_complete on it — if any tasks are still pending this
            # raises "Event loop is closed"; confirm whether the order is
            # intentional before changing it
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        # Run a single coroutine to completion on *loop*, then drain
        # async generators and (on 3.9+) the default executor
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        # Cancel every outstanding task on *loop* and surface any exceptions
        # they raised (other than cancellation) via the exception handler
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5500
5501
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in headers.items():
            # Title-casing normalizes e.g. 'x-foo' and 'X-FOO' to 'X-Foo'
            merged[name.title()] = value
    return merged
5505
5506
def cached_method(f):
    """Cache a method per-instance, keyed on its bound (non-self) arguments"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        # Drop `self` from the key; defaults are applied so equivalent calls hit the same entry
        key = tuple(bound.arguments.values())[1:]

        caches = vars(self).setdefault('_cached_method__cache', {})
        cache = caches.setdefault(f.__name__, {})
        try:
            return cache[key]
        except KeyError:
            cache[key] = f(self, *args, **kwargs)
            return cache[key]
    return wrapper
5522
5523
class classproperty:
    """property access for class methods with optional caching"""

    def __new__(cls, func=None, *args, **kwargs):
        # Support both bare use (@classproperty) and parametrized use
        # (@classproperty(cache=True)): with no function yet, hand back a
        # partial that will receive it on the second call.
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        if self._cache is None:
            return self.func(cls)
        try:
            return self._cache[cls]
        except KeyError:
            # Cached per *class*, so subclasses get their own value
            self._cache[cls] = self.func(cls)
            return self._cache[cls]
5542
5543
class function_with_repr:
    """Wrap a callable so that repr() of the wrapper is stable and informative."""

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func, self.__repr = func, repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        # Use the explicit repr when one was supplied, else the qualified name
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5556
5557
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        # Iterates the attribute *values*, in definition order
        yield from self.__dict__.values()

    @property
    def items_(self):
        # Trailing underscore avoids clashing with an attribute named `items`
        return self.__dict__.items()
5567
5568
# Known file extensions grouped by media kind; `common_*` holds the most
# frequently seen ones and is folded into the full `video`/`audio` lists below
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# Merge the common extensions into the full per-kind tuples
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# Every extension yt-dlp recognizes as downloadable media or a manifest
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5583
5584
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # attempt: 1-based count of the current try; _error: last stored error,
    # with NO_DEFAULT acting as the "no error this attempt" sentinel
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        # kwargs are pre-bound into the callback (e.g. sleep_func/info/warn)
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # Retry while the last attempt stored an error (or none ran yet)
        # and the attempt budget is not exhausted
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # Expose the sentinel as None to callers
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Reset to the sentinel; the loop body sets .error on failure
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            # Budget exhausted: report via `error` if given, else re-raise
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        # sleep_func may be a callable taking the retry number, or a constant
        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5639
5640
def make_archive_id(ie, video_id):
    """Build a download-archive entry: lowercased extractor key + video id."""
    if not isinstance(ie, str):
        ie = ie.ie_key()
    return f'{ie.lower()} {video_id}'
5644
5645
def truncate_string(s, left, right=0):
    """Shorten *s* to at most ``left + right`` characters, marking the cut with '...'.

    Keeps the first ``left - 3`` and (optionally) the last ``right`` characters.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    suffix = s[-right:] if right else ''
    return s[:left - 3] + '...' + suffix
5651
5652
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Expand *options* (with '-' removal prefixes and aliases) into an ordered set.

    @param options     requested item names; a leading '-' removes instead of adds
    @param alias_dict  maps alias -> list of items; must contain an 'all' alias
                       listing every valid item
    @param use_regex   treat non-alias entries as regexes matched against 'all'
    @param start       initial selection to build upon
    @raises ValueError if an entry matches neither an alias nor 'all'
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for val in options:
        discard = val.startswith('-')
        if discard:
            val = val[1:]

        if val in alias_dict:
            # Discarding an alias flips the sign of each of its members
            val = alias_dict[val] if not discard else [
                i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(val, alias_dict, start=requested)
            continue

        current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
                   else [val] if val in alias_dict['all'] else None)
        if current is None:
            raise ValueError(val)

        if discard:
            # Remove every earlier occurrence of each matched item
            for item in current:
                while item in requested:
                    requested.remove(item)
        else:
            requested.extend(current)

    return orderedSet(requested)
5681
5682
5683 # TODO: Rewrite
class FormatSorter:
    """Sorts formats according to `--format-sort`-style field specifications.

    Each format is mapped to a tuple of per-field preference tuples
    (see _calculate_field_preference_from_value); formats are then compared
    by that key, larger meaning "better".
    """
    # Parses one sort token: optional '+' (reverse), field name and an
    # optional ':limit' (cap) or '~limit' (prefer closest) suffix
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Ordering that mimics youtube-dl's historical behavior
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration: 'type' selects the comparison strategy
    # (ordered/boolean/extractor/combined/multiple/alias/plain field),
    # 'field' maps to the actual format-dict key(s), 'convert' normalizes values
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'),
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'),
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Look up *key* for *field* in `settings`, filling in defaults lazily."""
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            # Derive and memoize a sensible default based on the field type
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Normalize a raw format value for *field* into a comparable number/string."""
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            # Map the value to its (reversed) index in the configured order,
            # so earlier entries get larger (better) scores
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # Mixed string data: switch the field to string comparison from now on
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Build self._order (and per-field limit settings) from user/extractor/default sort specs."""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Register one sort field; first occurrence wins
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # forced fields always lead; priority fields lead unless format_sort_force;
        # then user prefs, extractor prefs, and finally the defaults
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                # Reuse the single limit for all sub-fields when only one is given
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Emit the effective sort order (with limits) via *write_debug*."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn one field's value into a comparison tuple (larger compares better)."""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        # First tuple element ranks the case (missing < below-limit < normal < string),
        # the rest order values within that case
        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Extract the (possibly combined) value for *field* and score it."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Fill in missing format fields, then return its full sort key tuple."""
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #     format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)