yt_dlp/utils/_utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import collections.abc
   9 import contextlib
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import gzip
  15 import hashlib
  16 import hmac
  17 import html.entities
  18 import html.parser
  19 import http.client
  20 import http.cookiejar
  21 import inspect
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import mimetypes
  28 import netrc
  29 import operator
  30 import os
  31 import platform
  32 import random
  33 import re
  34 import shlex
  35 import socket
  36 import ssl
  37 import struct
  38 import subprocess
  39 import sys
  40 import tempfile
  41 import time
  42 import traceback
  43 import types
  44 import unicodedata
  45 import urllib.error
  46 import urllib.parse
  47 import urllib.request
  48 import xml.etree.ElementTree
  49 import zlib
  50
  51 from . import traversal
  52
  53 from ..compat import functools  # isort: split
  54 from ..compat import (
  55     compat_etree_fromstring,
  56     compat_expanduser,
  57     compat_HTMLParseError,
  58     compat_os_name,
  59     compat_shlex_quote,
  60 )
  61 from ..dependencies import brotli, certifi, websockets, xattr
  62 from ..socks import ProxyType, sockssocket
  63
# Drop the last dotted component of this module's name
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
  68
  69
  70 def random_user_agent():
  71     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  72     _CHROME_VERSIONS = (
  73         '90.0.4430.212',
  74         '90.0.4430.24',
  75         '90.0.4430.70',
  76         '90.0.4430.72',
  77         '90.0.4430.85',
  78         '90.0.4430.93',
  79         '91.0.4472.101',
  80         '91.0.4472.106',
  81         '91.0.4472.114',
  82         '91.0.4472.124',
  83         '91.0.4472.164',
  84         '91.0.4472.19',
  85         '91.0.4472.77',
  86         '92.0.4515.107',
  87         '92.0.4515.115',
  88         '92.0.4515.131',
  89         '92.0.4515.159',
  90         '92.0.4515.43',
  91         '93.0.4556.0',
  92         '93.0.4577.15',
  93         '93.0.4577.63',
  94         '93.0.4577.82',
  95         '94.0.4606.41',
  96         '94.0.4606.54',
  97         '94.0.4606.61',
  98         '94.0.4606.71',
  99         '94.0.4606.81',
 100         '94.0.4606.85',
 101         '95.0.4638.17',
 102         '95.0.4638.50',
 103         '95.0.4638.54',
 104         '95.0.4638.69',
 105         '95.0.4638.74',
 106         '96.0.4664.18',
 107         '96.0.4664.45',
 108         '96.0.4664.55',
 109         '96.0.4664.93',
 110         '97.0.4692.20',
 111     )
 112     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 113
 114
# Content-encoding names; 'br' is appended only when the brotli dependency is truthy
# NOTE(review): presumably advertised via Accept-Encoding elsewhere - confirm
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default header values; the User-Agent is picked once, when this module is imported
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


# Fixed User-Agent strings, looked up by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
 132
 133
 134 class NO_DEFAULT:
 135     pass
 136
 137
 138 def IDENTITY(x):
 139     return x
 140
 141
# English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code; every list is ordered January..December
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# Timezone abbreviation -> offset from UTC, in whole hours
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
# Maps each accented character to an ASCII replacement; a few map to two letters ('AE', 'OE', 'TH', 'ss', ...)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 172
# strptime-style format strings that do not depend on day/month ordering
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# The common formats plus numeric formats that put the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# The common formats plus numeric formats that put the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list `}('...',<int>,<int>,'...'.split('|')`
# NOTE(review): presumably the tail of packed (P.A.C.K.E.R.-style) JavaScript - confirm
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element; the object/array payload is captured as `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Matches an unsigned integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
 243
 244
 245 @functools.cache
 246 def preferredencoding():
 247     """Get preferred encoding.
 248
 249     Returns the best encoding scheme for the system, based on
 250     locale.getpreferredencoding() and some further tweaks.
 251     """
 252     try:
 253         pref = locale.getpreferredencoding()
 254         'TEST'.encode(pref)
 255     except Exception:
 256         pref = 'UTF-8'
 257
 258     return pref
 259
 260
 261 def write_json_file(obj, fn):
 262     """ Encode obj as JSON and write it to fn, atomically if possible """
 263
 264     tf = tempfile.NamedTemporaryFile(
 265         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 266         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 267
 268     try:
 269         with tf:
 270             json.dump(obj, tf, ensure_ascii=False)
 271         if sys.platform == 'win32':
 272             # Need to remove existing file on Windows, else os.rename raises
 273             # WindowsError or FileExistsError.
 274             with contextlib.suppress(OSError):
 275                 os.unlink(fn)
 276         with contextlib.suppress(OSError):
 277             mask = os.umask(0)
 278             os.umask(mask)
 279             os.chmod(tf.name, 0o666 & ~mask)
 280         os.rename(tf.name, fn)
 281     except Exception:
 282         with contextlib.suppress(OSError):
 283             os.remove(tf.name)
 284         raise
 285
 286
 287 def find_xpath_attr(node, xpath, key, val=None):
 288     """ Find the xpath xpath[@key=val] """
 289     assert re.match(r'^[a-zA-Z_-]+$', key)
 290     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 291     return node.find(expr)
 292
 293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 294 # the namespace parameter
 295
 296
 297 def xpath_with_ns(path, ns_map):
 298     components = [c.split(':') for c in path.split('/')]
 299     replaced = []
 300     for c in components:
 301         if len(c) == 1:
 302             replaced.append(c[0])
 303         else:
 304             ns, tag = c
 305             replaced.append('{%s}%s' % (ns_map[ns], tag))
 306     return '/'.join(replaced)
 307
 308
 309 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 310     def _find_xpath(xpath):
 311         return node.find(xpath)
 312
 313     if isinstance(xpath, str):
 314         n = _find_xpath(xpath)
 315     else:
 316         for xp in xpath:
 317             n = _find_xpath(xp)
 318             if n is not None:
 319                 break
 320
 321     if n is None:
 322         if default is not NO_DEFAULT:
 323             return default
 324         elif fatal:
 325             name = xpath if name is None else name
 326             raise ExtractorError('Could not find XML element %s' % name)
 327         else:
 328             return None
 329     return n
 330
 331
 332 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 333     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 334     if n is None or n == default:
 335         return n
 336     if n.text is None:
 337         if default is not NO_DEFAULT:
 338             return default
 339         elif fatal:
 340             name = xpath if name is None else name
 341             raise ExtractorError('Could not find XML element\'s text %s' % name)
 342         else:
 343             return None
 344     return n.text
 345
 346
 347 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 348     n = find_xpath_attr(node, xpath, key)
 349     if n is None:
 350         if default is not NO_DEFAULT:
 351             return default
 352         elif fatal:
 353             name = f'{xpath}[@{key}]' if name is None else name
 354             raise ExtractorError('Could not find XML attribute %s' % name)
 355         else:
 356             return None
 357     return n.attrib[key]
 358
 359
 360 def get_element_by_id(id, html, **kwargs):
 361     """Return the content of the tag with the specified ID in the passed HTML document"""
 362     return get_element_by_attribute('id', id, html, **kwargs)
 363
 364
 365 def get_element_html_by_id(id, html, **kwargs):
 366     """Return the html of the tag with the specified ID in the passed HTML document"""
 367     return get_element_html_by_attribute('id', id, html, **kwargs)
 368
 369
 370 def get_element_by_class(class_name, html):
 371     """Return the content of the first tag with the specified class in the passed HTML document"""
 372     retval = get_elements_by_class(class_name, html)
 373     return retval[0] if retval else None
 374
 375
 376 def get_element_html_by_class(class_name, html):
 377     """Return the html of the first tag with the specified class in the passed HTML document"""
 378     retval = get_elements_html_by_class(class_name, html)
 379     return retval[0] if retval else None
 380
 381
 382 def get_element_by_attribute(attribute, value, html, **kwargs):
 383     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 384     return retval[0] if retval else None
 385
 386
 387 def get_element_html_by_attribute(attribute, value, html, **kargs):
 388     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 389     return retval[0] if retval else None
 390
 391
 392 def get_elements_by_class(class_name, html, **kargs):
 393     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 394     return get_elements_by_attribute(
 395         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 396         html, escape_value=False)
 397
 398
 399 def get_elements_html_by_class(class_name, html):
 400     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 401     return get_elements_html_by_attribute(
 402         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 403         html, escape_value=False)
 404
 405
 406 def get_elements_by_attribute(*args, **kwargs):
 407     """Return the content of the tag with the specified attribute in the passed HTML document"""
 408     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 409
 410
 411 def get_elements_html_by_attribute(*args, **kwargs):
 412     """Return the html of the tag with the specified attribute in the passed HTML document"""
 413     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 414
 415
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator yielding one (content, whole) tuple per matching tag.

    @param attribute     Attribute name; always matched literally
    @param value         Attribute value to look for. Nothing is yielded if it is empty
    @param html          The HTML document to search
    @param tag           Regex that the tag name has to match
    @param escape_value  Match value literally (True) or treat it as a regex (False)
    """
    if not value:
        return

    # The quotes around the attribute value are optional in the regex below,
    # unless value starts with a character from this class
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches from the start of the opening tag up to the end of the wanted
    # attribute; any attributes before it are skipped over
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Extract the complete element starting where the opening tag was found
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of matching quotes enclosing the content, then decode HTML entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 441
 442
 443 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 444     """
 445     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 446     closing tag for the first opening tag it has encountered, and can be used
 447     as a context manager
 448     """
 449
 450     class HTMLBreakOnClosingTagException(Exception):
 451         pass
 452
 453     def __init__(self):
 454         self.tagstack = collections.deque()
 455         html.parser.HTMLParser.__init__(self)
 456
 457     def __enter__(self):
 458         return self
 459
 460     def __exit__(self, *_):
 461         self.close()
 462
 463     def close(self):
 464         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 465         # so data remains buffered; we no longer have any interest in it, thus
 466         # override this method to discard it
 467         pass
 468
 469     def handle_starttag(self, tag, _):
 470         self.tagstack.append(tag)
 471
 472     def handle_endtag(self, tag):
 473         if not self.tagstack:
 474             raise compat_HTMLParseError('no tags in the stack')
 475         while self.tagstack:
 476             inner_tag = self.tagstack.pop()
 477             if inner_tag == tag:
 478                 break
 479         else:
 480             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 481         if not self.tagstack:
 482             raise self.HTMLBreakOnClosingTagException()
 483
 484
 485 # XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    @returns  A tuple (content, whole): the text between the opening tag and its
              matching closing tag, and the element including both tags
    @raises   compat_HTMLParseError if the opening tag or a matching closing
              tag cannot be found
    """
    def find_or_raise(haystack, needle, exc):
        # Like str.index, but raises the given exception when needle is missing
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Convert to an absolute index just past the '>' of the opening tag
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed the opening tag first so that it sits at the bottom of the tag stack
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            # Both of these are relative to offset
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                # Feed everything up to and including the next closing tag; the parser
                # raises once that tag closes the element opened at whole_start
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
 519
 520
 521 class HTMLAttributeParser(html.parser.HTMLParser):
 522     """Trivial HTML parser to gather the attributes for a single element"""
 523
 524     def __init__(self):
 525         self.attrs = {}
 526         html.parser.HTMLParser.__init__(self)
 527
 528     def handle_starttag(self, tag, attrs):
 529         self.attrs = dict(attrs)
 530         raise compat_HTMLParseError('done')
 531
 532
 533 class HTMLListAttrsParser(html.parser.HTMLParser):
 534     """HTML parser to gather the attributes for the elements of a list"""
 535
 536     def __init__(self):
 537         html.parser.HTMLParser.__init__(self)
 538         self.items = []
 539         self._level = 0
 540
 541     def handle_starttag(self, tag, attrs):
 542         if tag == 'li' and self._level == 0:
 543             self.items.append(dict(attrs))
 544         self._level += 1
 545
 546     def handle_endtag(self, tag):
 547         self._level -= 1
 548
 549
 550 def extract_attributes(html_element):
 551     """Given a string for an HTML element such as
 552     <el
 553          a="foo" B="bar" c="&98;az" d=boz
 554          empty= noval entity="&amp;"
 555          sq='"' dq="'"
 556     >
 557     Decode and return a dictionary of attributes.
 558     {
 559         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 560         'empty': '', 'noval': None, 'entity': '&',
 561         'sq': '"', 'dq': '\''
 562     }.
 563     """
 564     parser = HTMLAttributeParser()
 565     with contextlib.suppress(compat_HTMLParseError):
 566         parser.feed(html_element)
 567         parser.close()
 568     return parser.attrs
 569
 570
 571 def parse_list(webpage):
 572     """Given a string for an series of HTML <li> elements,
 573     return a dictionary of their attributes"""
 574     parser = HTMLListAttrsParser()
 575     parser.feed(webpage)
 576     parser.close()
 577     return parser.items
 578
 579
 580 def clean_html(html):
 581     """Clean an HTML snippet into a readable string"""
 582
 583     if html is None:  # Convenience for sanitizing descriptions etc.
 584         return html
 585
 586     html = re.sub(r'\s+', ' ', html)
 587     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 588     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 589     # Strip html tags
 590     html = re.sub('<.*?>', '', html)
 591     # Replace html entities
 592     html = unescapeHTML(html)
 593     return html.strip()
 594
 595
 596 class LenientJSONDecoder(json.JSONDecoder):
 597     # TODO: Write tests
 598     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 599         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 600         self._close_attempts = 2 * close_objects
 601         super().__init__(*args, **kwargs)
 602
 603     @staticmethod
 604     def _close_object(err):
 605         doc = err.doc[:err.pos]
 606         # We need to add comma first to get the correct error message
 607         if err.msg.startswith('Expecting \',\''):
 608             return doc + ','
 609         elif not doc.endswith(','):
 610             return
 611
 612         if err.msg.startswith('Expecting property name'):
 613             return doc[:-1] + '}'
 614         elif err.msg.startswith('Expecting value'):
 615             return doc[:-1] + ']'
 616
 617     def decode(self, s):
 618         if self.transform_source:
 619             s = self.transform_source(s)
 620         for attempt in range(self._close_attempts + 1):
 621             try:
 622                 if self.ignore_extra:
 623                     return self.raw_decode(s.lstrip())[0]
 624                 return super().decode(s)
 625             except json.JSONDecodeError as e:
 626                 if e.pos is None:
 627                     raise
 628                 elif attempt < self._close_attempts:
 629                     s = self._close_object(e)
 630                     if s is not None:
 631                         continue
 632                 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
 633         assert False, 'Too many attempts to decode JSON'
 634
 635
 636 def sanitize_open(filename, open_mode):
 637     """Try to open the given filename, and slightly tweak it if this fails.
 638
 639     Attempts to open the given filename. If this fails, it tries to change
 640     the filename slightly, step by step, until it's either able to open it
 641     or it fails and raises a final exception, like the standard open()
 642     function.
 643
 644     It returns the tuple (stream, definitive_file_name).
 645     """
 646     if filename == '-':
 647         if sys.platform == 'win32':
 648             import msvcrt
 649
 650             # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
 651             with contextlib.suppress(io.UnsupportedOperation):
 652                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 653         return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 654
 655     for attempt in range(2):
 656         try:
 657             try:
 658                 if sys.platform == 'win32':
 659                     # FIXME: An exclusive lock also locks the file from being read.
 660                     # Since windows locks are mandatory, don't lock the file on windows (for now).
 661                     # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
 662                     raise LockingUnsupportedError()
 663                 stream = locked_file(filename, open_mode, block=False).__enter__()
 664             except OSError:
 665                 stream = open(filename, open_mode)
 666             return stream, filename
 667         except OSError as err:
 668             if attempt or err.errno in (errno.EACCES,):
 669                 raise
 670             old_filename, filename = filename, sanitize_path(filename)
 671             if old_filename == filename:
 672                 raise
 673
 674
 675 def timeconvert(timestr):
 676     """Convert RFC 2822 defined time string into system timestamp"""
 677     timestamp = None
 678     timetuple = email.utils.parsedate_tz(timestr)
 679     if timetuple is not None:
 680         timestamp = email.utils.mktime_tz(timetuple)
 681     return timestamp
 682
 683
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; '' only if s itself is ''
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Returns the replacement for a single character. Replacements prefixed
        # with '\0' are "substitute chars": the marker lets the post-processing
        # below tell them apart from genuine characters
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Question marks and control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers; never return an empty name
    result = result.replace('\0', '') or '_'

    # NO_DEFAULT is a class and therefore truthy, so this cleanup only runs
    # when is_id is explicitly passed as a false value
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 737
 738
 739 def sanitize_path(s, force=False):
 740     """Sanitizes and normalizes path on Windows"""
 741     if sys.platform == 'win32':
 742         force = False
 743         drive_or_unc, _ = os.path.splitdrive(s)
 744     elif force:
 745         drive_or_unc = ''
 746     else:
 747         return s
 748
 749     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 750     if drive_or_unc:
 751         norm_path.pop(0)
 752     sanitized_path = [
 753         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 754         for path_part in norm_path]
 755     if drive_or_unc:
 756         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 757     elif force and s and s[0] == os.path.sep:
 758         sanitized_path.insert(0, os.path.sep)
 759     return os.path.join(*sanitized_path)
 760
 761
 762 def sanitize_url(url, *, scheme='http'):
 763     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 764     # the number of unwanted failures due to missing protocol
 765     if url is None:
 766         return
 767     elif url.startswith('//'):
 768         return f'{scheme}:{url}'
 769     # Fix some common typos seen so far
 770     COMMON_TYPOS = (
 771         # https://github.com/ytdl-org/youtube-dl/issues/15649
 772         (r'^httpss://', r'https://'),
 773         # https://bx1.be/lives/direct-tv/
 774         (r'^rmtp([es]?)://', r'rtmp\1://'),
 775     )
 776     for mistake, fixup in COMMON_TYPOS:
 777         if re.match(mistake, url):
 778             return re.sub(mistake, fixup, url)
 779     return url
 780
 781
 782 def extract_basic_auth(url):
 783     parts = urllib.parse.urlsplit(url)
 784     if parts.username is None:
 785         return url, None
 786     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 787         parts.hostname if parts.port is None
 788         else '%s:%d' % (parts.hostname, parts.port))))
 789     auth_payload = base64.b64encode(
 790         ('%s:%s' % (parts.username, parts.password or '')).encode())
 791     return url, f'Basic {auth_payload.decode()}'
 792
 793
 794 def sanitized_Request(url, *args, **kwargs):
 795     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 796     if auth_header is not None:
 797         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 798         headers['Authorization'] = auth_header
 799     return urllib.request.Request(url, *args, **kwargs)
 800
 801
 802 def expand_path(s):
 803     """Expand shell variables and ~"""
 804     return os.path.expandvars(compat_expanduser(s))
 805
 806
 807 def orderedSet(iterable, *, lazy=False):
 808     """Remove all duplicates from the input iterable"""
 809     def _iter():
 810         seen = []  # Do not use set since the items can be unhashable
 811         for x in iterable:
 812             if x not in seen:
 813                 seen.append(x)
 814                 yield x
 815
 816     return _iter() if lazy else list(_iter())
 817
 818
 819 def _htmlentity_transform(entity_with_semicolon):
 820     """Transforms an HTML entity to a character."""
 821     entity = entity_with_semicolon[:-1]
 822
 823     # Known non-numeric HTML entity
 824     if entity in html.entities.name2codepoint:
 825         return chr(html.entities.name2codepoint[entity])
 826
 827     # TODO: HTML5 allows entities without a semicolon.
 828     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 829     if entity_with_semicolon in html.entities.html5:
 830         return html.entities.html5[entity_with_semicolon]
 831
 832     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 833     if mobj is not None:
 834         numstr = mobj.group(1)
 835         if numstr.startswith('x'):
 836             base = 16
 837             numstr = '0%s' % numstr
 838         else:
 839             base = 10
 840         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 841         with contextlib.suppress(ValueError):
 842             return chr(int(numstr, base))
 843
 844     # Unknown entity in name, return its literal representation
 845     return '&%s;' % entity
 846
 847
 848 def unescapeHTML(s):
 849     if s is None:
 850         return None
 851     assert isinstance(s, str)
 852
 853     return re.sub(
 854         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 855
 856
 857 def escapeHTML(text):
 858     return (
 859         text
 860         .replace('&', '&amp;')
 861         .replace('<', '&lt;')
 862         .replace('>', '&gt;')
 863         .replace('"', '&quot;')
 864         .replace("'", '&#39;')
 865     )
 866
 867
 868 class netrc_from_content(netrc.netrc):
 869     def __init__(self, content):
 870         self.hosts, self.macros = {}, {}
 871         with io.StringIO(content) as stream:
 872             self._parse('-', stream, False)
 873
 874
 875 class Popen(subprocess.Popen):
 876     if sys.platform == 'win32':
 877         _startupinfo = subprocess.STARTUPINFO()
 878         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 879     else:
 880         _startupinfo = None
 881
 882     @staticmethod
 883     def _fix_pyinstaller_ld_path(env):
 884         """Restore LD_LIBRARY_PATH when using PyInstaller
 885             Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
 886                  https://github.com/yt-dlp/yt-dlp/issues/4573
 887         """
 888         if not hasattr(sys, '_MEIPASS'):
 889             return
 890
 891         def _fix(key):
 892             orig = env.get(f'{key}_ORIG')
 893             if orig is None:
 894                 env.pop(key, None)
 895             else:
 896                 env[key] = orig
 897
 898         _fix('LD_LIBRARY_PATH')  # Linux
 899         _fix('DYLD_LIBRARY_PATH')  # macOS
 900
 901     def __init__(self, *args, env=None, text=False, **kwargs):
 902         if env is None:
 903             env = os.environ.copy()
 904         self._fix_pyinstaller_ld_path(env)
 905
 906         self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
 907         if text is True:
 908             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 909             kwargs.setdefault('encoding', 'utf-8')
 910             kwargs.setdefault('errors', 'replace')
 911         super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
 912
 913     def communicate_or_kill(self, *args, **kwargs):
 914         try:
 915             return self.communicate(*args, **kwargs)
 916         except BaseException:  # Including KeyboardInterrupt
 917             self.kill(timeout=None)
 918             raise
 919
 920     def kill(self, *, timeout=0):
 921         super().kill()
 922         if timeout != 0:
 923             self.wait(timeout=timeout)
 924
 925     @classmethod
 926     def run(cls, *args, timeout=None, **kwargs):
 927         with cls(*args, **kwargs) as proc:
 928             default = '' if proc.__text_mode else b''
 929             stdout, stderr = proc.communicate_or_kill(timeout=timeout)
 930             return stdout or default, stderr or default, proc.returncode
 931
 932
 933 def encodeArgument(s):
 934     # Legacy code that uses byte strings
 935     # Uncomment the following line after fixing all post processors
 936     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 937     return s if isinstance(s, str) else s.decode('ascii')
 938
 939
 940 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 941
 942
 943 def timetuple_from_msec(msec):
 944     secs, msec = divmod(msec, 1000)
 945     mins, secs = divmod(secs, 60)
 946     hrs, mins = divmod(mins, 60)
 947     return _timetuple(hrs, mins, secs, msec)
 948
 949
 950 def formatSeconds(secs, delim=':', msec=False):
 951     time = timetuple_from_msec(secs * 1000)
 952     if time.hours:
 953         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 954     elif time.minutes:
 955         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 956     else:
 957         ret = '%d' % time.seconds
 958     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 959
 960
 961 def _ssl_load_windows_store_certs(ssl_context, storename):
 962     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 963     try:
 964         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 965                  if encoding == 'x509_asn' and (
 966                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 967     except PermissionError:
 968         return
 969     for cert in certs:
 970         with contextlib.suppress(ssl.SSLError):
 971             ssl_context.load_verify_locations(cadata=cert)
 972
 973
 974 def make_HTTPS_handler(params, **kwargs):
 975     opts_check_certificate = not params.get('nocheckcertificate')
 976     context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
 977     context.check_hostname = opts_check_certificate
 978     if params.get('legacyserverconnect'):
 979         context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
 980         # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
 981         context.set_ciphers('DEFAULT')
 982     elif (
 983         sys.version_info < (3, 10)
 984         and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
 985         and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
 986     ):
 987         # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
 988         # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
 989         # in some situations [2][3].
 990         # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
 991         # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
 992         # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
 993         # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
 994         # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
 995         # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
 996         # 4. https://peps.python.org/pep-0644/
 997         # 5. https://peps.python.org/pep-0644/#libressl-support
 998         # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
 999         context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
1000         context.minimum_version = ssl.TLSVersion.TLSv1_2
1001
1002     context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1003     if opts_check_certificate:
1004         if certifi and 'no-certifi' not in params.get('compat_opts', []):
1005             context.load_verify_locations(cafile=certifi.where())
1006         else:
1007             try:
1008                 context.load_default_certs()
1009                 # Work around the issue in load_default_certs when there are bad certificates. See:
1010                 # https://github.com/yt-dlp/yt-dlp/issues/1060,
1011                 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1012             except ssl.SSLError:
1013                 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1014                 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1015                     for storename in ('CA', 'ROOT'):
1016                         _ssl_load_windows_store_certs(context, storename)
1017                 context.set_default_verify_paths()
1018
1019     client_certfile = params.get('client_certificate')
1020     if client_certfile:
1021         try:
1022             context.load_cert_chain(
1023                 client_certfile, keyfile=params.get('client_certificate_key'),
1024                 password=params.get('client_certificate_password'))
1025         except ssl.SSLError:
1026             raise YoutubeDLError('Unable to load client certificate')
1027
1028     # Some servers may reject requests if ALPN extension is not sent. See:
1029     # https://github.com/python/cpython/issues/85140
1030     # https://github.com/yt-dlp/yt-dlp/issues/3878
1031     with contextlib.suppress(NotImplementedError):
1032         context.set_alpn_protocols(['http/1.1'])
1033
1034     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1035
1036
def bug_reports_message(before=';'):
    """Return the "please report this issue" text, prefixed with before.

    Trailing whitespace is stripped from before. The message starts with a
    capital letter if before is empty or ends a sentence ('.', '!' or '?').
    """
    from ..update import REPOSITORY

    message = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
               'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    prefix = before.rstrip()
    starts_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_sentence:
        message = message[0].title() + message[1:]

    if not prefix:
        return message
    return prefix + ' ' + message
1048
1049
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Default message; subclasses may override it with a fixed text
    msg = None

    def __init__(self, msg=None):
        """Use msg if given, else the class-level msg, else the name of the class."""
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
1060
1061
# Exception types that are treated as network errors (see ExtractorError)
network_exceptions = (urllib.error.URLError, http.client.HTTPException, socket.error)
if hasattr(ssl, 'CertificateError'):
    network_exceptions += (ssl.CertificateError, )
1066
1067
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        active_exc = sys.exc_info()
        # An error raised while a network exception is being handled is always "expected"
        if active_exc[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        # preserve original exception; for a nested ExtractorError, keep the innermost one
        if isinstance(active_exc[1], ExtractorError):
            active_exc = active_exc[1].exc_info
        self.exc_info = active_exc
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: '[ie] video_id: orig_msg (caused by ...)', followed by
        # the bug report hint if the error was not expected
        pieces = [
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
        ]
        if not self.expected:
            pieces.append(bug_reports_message())
        return ''.join(pieces)

    def format_traceback(self):
        """Return the formatted traceback and cause joined by a newline, or None if there is neither."""
        tb_text = self.traceback and ''.join(traceback.format_tb(self.traceback))
        cause_text = self.cause and ''.join(
            traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:])
        return join_nonempty(tb_text, cause_text, delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once a message has been built, rebuild it (and args) whenever another attribute changes
        if name in ('msg', 'args') or not getattr(self, 'msg', None):
            return
        self.msg = self.__msg or type(self).__name__
        self.args = (self.msg, )  # Cannot be property
1109
1110
class UnsupportedError(ExtractorError):
    """Expected error for a URL that is not supported; the URL is kept in .url"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1116
1117
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
1121
1122
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    Raised when a video cannot be accessed from the current geographic
    location because of restrictions imposed by the website.
    The error is always marked as expected.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # 'expected' is forced to True, overriding any value given by the caller
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
1134
1135
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        # Always an expected error; a default text is used if no message is given
        message = msg or 'The channel is not currently live'
        super().__init__(message, **{**kwargs, 'expected': True})
1142
1143
class DownloadError(YoutubeDLError):
    """Download Error exception.

    FileDownloader objects may raise this exception when they are not
    configured to continue on errors. It carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
1156
1157
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    Raised by YoutubeDL when a requested entry cannot be found
    in the info_dict of the playlist.
    """
    msg = 'Entry not found in info'
1165
1166
class SameFileError(YoutubeDLError):
    """Same File exception.

    Raised by FileDownloader objects when they detect that multiple
    files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        # The file name, if known, is appended to the class-level message
        message = self.msg if filename is None else self.msg + f': {filename}'
        super().__init__(message)
1179
1180
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception
    to signal an error in the postprocessing task.
    """
1187
1188
class DownloadCancelled(YoutubeDLError):
    """ Raised to signal that the download queue should be interrupted """
    msg = 'The download was cancelled'
1192
1193
class ExistingVideoReached(DownloadCancelled):
    """ Cancellation triggered by --break-on-existing """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1197
1198
class RejectedVideoReached(DownloadCancelled):
    """ Cancellation triggered by --break-match-filter """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1202
1203
class MaxDownloadsReached(DownloadCancelled):
    """ Cancellation triggered when the --max-downloads limit has been reached """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1207
1208
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        # The flag is stored on the exception as given
        self.expected = expected
        super().__init__(msg)
1215
1216
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always raised with the fixed class message and expected=False
        super().__init__(msg=self.msg, expected=False)
1223
1224
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Raised when a video is requested in a format
    that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        # Details of the underlying error, if given, are appended to the class-level message
        message = self.msg if err is None else self.msg + f': {err}'
        super().__init__(message)
1237
1238
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, which indicates that
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1252
1253
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an errno code and message, classified into .reason

    reason is 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        out_of_space = (
            self.code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in self.msg
            or 'Disk quota exceeded' in self.msg)
        if out_of_space:
            reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1268
1269
class XAttrUnavailableError(YoutubeDLError):
    """YoutubeDLError subclass without any behavior of its own"""
1272
1273
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, honouring the 'source_address' param of ydl_handler.

    If a source address is configured, the connection binds to it and only
    connects to remote addresses of the same IP version.
    """
    connection = http_class(*args, **kwargs)
    bind_address = ydl_handler._params.get('source_address')
    if bind_address is None:
        return connection

    # This is to workaround _create_connection() from socket where it will try all
    # address data from getaddrinfo() including IPv6. This filters the result from
    # getaddrinfo() based on the source_address value.
    # This is based on the cpython socket.create_connection() function.
    # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
    def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
        host, port = address
        candidates = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
        # An IPv4 source address contains dots; anything else is treated as IPv6
        family = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
        usable = [info for info in candidates if info[0] == family]
        if candidates and not usable:
            ip_version = 'v4' if family == socket.AF_INET else 'v6'
            raise OSError(
                "No remote IP%s addresses available for connect, can't use '%s' as source address"
                % (ip_version, source_address[0]))

        last_error = None
        for af, socktype, proto, _canonname, sockaddr in usable:
            sock = None
            try:
                sock = socket.socket(af, socktype, proto)
                if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                    sock.settimeout(timeout)
                sock.bind(source_address)
                sock.connect(sockaddr)
                last_error = None  # Explicitly break reference cycle
                return sock
            except OSError as exc:
                last_error = exc
                if sock is not None:
                    sock.close()

        if last_error is None:
            raise OSError('getaddrinfo returns an empty list')
        raise last_error

    if hasattr(connection, '_create_connection'):
        connection._create_connection = _create_connection
    connection.source_address = (bind_address, 0)
    return connection
1319
1320
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class automatically adds
    the standard headers to every HTTP request and decodes gzipped, deflated
    and brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, through a SOCKS proxy if the 'Ytdl-socks-proxy' header is set."""
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # Internal header; it must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, conn_class, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, first as a raw stream, then as a zlib stream."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli data; empty data is returned as is."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress gzip data, retrying with up to 1023 trailing bytes removed."""
        def _gunzip(blob):
            return gzip.GzipFile(fileobj=io.BytesIO(blob), mode='rb').read()

        try:
            return _gunzip(data)
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for trim in range(1, 1024):
                try:
                    return _gunzip(data[:-trim])
                except OSError:
                    continue
            raise original_oserror

    def http_request(self, req):
        """Percent-encode the URL and add the default and Accept-encoding headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        original_url = req.get_full_url()
        escaped_url = escape_url(original_url)

        # Substitute URL if any change after escaping
        if escaped_url != original_url:
            req = update_Request(req, url=escaped_url)

        default_headers = self._params.get('http_headers', std_headers)
        for name, value in default_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if name.capitalize() in req.headers:
                continue
            req.add_header(name, value)

        if 'Youtubedl-no-compression' in req.headers:  # deprecated
            del req.headers['Youtubedl-no-compression']
            req.add_header('Accept-encoding', 'identity')

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Decode a compressed response body and percent-encode a redirect Location."""
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        encodings = [token.strip() for token in resp.headers.get('Content-encoding', '').split(',')]
        for encoding in reversed(encodings):
            if encoding == 'gzip':
                decode = self.gz
            elif encoding == 'deflate':
                decode = self.deflate
            elif encoding == 'br' and brotli:
                decode = self.brotli
            else:
                # Unknown or unsupported encoding: leave the data untouched
                continue
            decoded_response = decode(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(
                io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if not 300 <= resp.code < 400:
            return resp
        location = resp.headers.get('Location')
        if location:
            # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
            location = location.encode('iso-8859-1').decode()
            location_escaped = escape_url(location)
            if location != location_escaped:
                del resp.headers['Location']
                resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
1448
1449
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of base_class that connects through a SOCKS proxy.

    base_class must be http.client.HTTPConnection, http.client.HTTPSConnection
    or a subclass of one of them. socks_proxy is the proxy URL; supported
    schemes are socks5, socks4a, socks4 and socks (an alias of socks4).
    The port defaults to 1080, and username/password are URL-decoded.

    Raises ValueError if the scheme of socks_proxy is not supported.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Without this branch socks_type would be left unbound and the problem
        # would only surface as an obscure UnboundLocalError further down
        raise ValueError(f'Unknown SOCKS proxy version: {scheme}')

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, TLS is negotiated on top of the proxied socket
            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1491
1492
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler supporting a custom connection class, SOCKS proxies and source addresses."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, through a SOCKS proxy if the 'Ytdl-socks-proxy' header is set.

        A TLS handshake failure alert is re-raised as YoutubeDLError with a
        hint about --legacy-server-connect.
        """
        conn_class = self._https_conn_class

        # Forward whichever of these attributes the base handler has set
        open_kwargs = {}
        for option, attribute in (
            ('context', '_context'),  # python > 2.6
            ('check_hostname', '_check_hostname'),  # python 3.x
        ):
            if hasattr(self, attribute):
                open_kwargs[option] = getattr(self, attribute)

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # Internal header; it must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, conn_class, True)
        try:
            return self.do_open(connection_factory, req, **open_kwargs)
        except urllib.error.URLError as e:
            reason = e.reason
            handshake_failed = (
                isinstance(reason, ssl.SSLError)
                and getattr(reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE')
            if not handshake_failed:
                raise
            raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1521
1522
def is_path_like(f):
    """Return True if f is a str, bytes or os.PathLike instance."""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1525
1526
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """HTTPCookieProcessor that handles https requests and responses like http ones."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1536
1537
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
     and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the request that follows a redirect of req to newurl.

        A 303 turns any method except HEAD into GET, and a 301/302 turns POST
        into GET. If the method changes, the payload and the Content-Length
        and Content-Type headers are dropped.

        Raises urllib.error.HTTPError for codes other than 301, 302, 303, 307 and 308.
        """
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_method = req.get_method()
        new_data = req.data
        # Lower-cased names of the headers that must not be carried over
        remove_headers = []
        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and req.get_method() != 'HEAD':
            new_method = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        elif code in (301, 302) and req.get_method() == 'POST':
            new_method = 'GET'

        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['content-length', 'content-type'])

        # Header names are compared case-insensitively: urllib stores them
        # capitalized (e.g. 'Content-type'), so both sides are lower-cased
        new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
1581
1582
def extract_timezone(date_str):
    """Split a trailing timezone off date_str.

    Recognized are 'Z', a numeric '+hh[:]mm' / '-hh[:]mm' offset and, after a
    time, a timezone name known to TIMEZONE_NAMES.

    Returns a tuple (UTC offset as datetime.timedelta, date_str without the
    timezone). If no timezone is found, the offset is zero and date_str is
    returned unchanged.
    """
    offset_match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)

    if offset_match:
        remainder = date_str[:-len(offset_match.group('tz'))]
        sign_char = offset_match.group('sign')
        if not sign_char:
            # A bare 'Z' denotes UTC
            return datetime.timedelta(), remainder
        direction = 1 if sign_char == '+' else -1
        offset = datetime.timedelta(
            hours=direction * int(offset_match.group('hours')),
            minutes=direction * int(offset_match.group('minutes')))
        return offset, remainder

    # No numeric offset; look for a timezone name following a time
    name_match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    offset_hours = TIMEZONE_NAMES.get(name_match and name_match.group('tz').strip())
    if offset_hours is not None:
        date_str = date_str[:-len(name_match.group('tz'))]
    return datetime.timedelta(hours=offset_hours or 0), date_str
1611
1612
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """
    Return a UNIX timestamp from the given ISO 8601-like date string

    @param delimiter    separator between the date and the time part
    @param timezone     UTC offset as a datetime.timedelta; when None it is
                        extracted from date_str
    @returns            the timestamp, or None if date_str is None or does
                        not match the expected format
    """
    if date_str is None:
        return None

    # Fractional parts are dropped; the format below has no place for them
    date_str = re.sub(r'\.[0-9]+', '', date_str)
    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        parsed = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((parsed - timezone).timetuple())
    except ValueError:
        return None
1628
1629
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST if day_first is truthy, else DATE_FORMATS_MONTH_FIRST"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1632
1633
def unified_strdate(date_str, day_first=True):
    """
    Return a string with the date in the format YYYYMMDD

    @param day_first    selects which list of formats from date_formats() is tried
    @returns            the date string, or None if date_str is None or
                        could not be parsed
    """
    if date_str is None:
        return None

    # Commas become spaces, then AM/PM (+ timezone) and any trailing timezone are dropped
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    date_str = extract_timezone(date_str)[1]

    upload_date = None
    # Every format is tried; a later matching format overrides an earlier one
    for fmt in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the RFC 2822 style parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')

    return None if upload_date is None else str(upload_date)
1656
1657
def unified_timestamp(date_str, day_first=True):
    """
    Return a UNIX timestamp parsed from a free-form date string

    @param day_first    selects which list of formats from date_formats() is tried
    @returns            the timestamp, or None if date_str is not a string
                        or could not be parsed
    """
    if not isinstance(date_str, str):
        return None

    # Drop "," / "|" separators and weekday names (including Sunday), then collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str))

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # The first format that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to the RFC 2822 style parser
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1689
1690
def determine_ext(url, default_ext='unknown_video'):
    """
    Guess the file extension from a URL

    @returns    the text after the last "." of the part before "?" when it is
                purely alphanumeric (or, minus trailing slashes, is listed in
                KNOWN_EXTENSIONS); default_ext otherwise
    """
    if url is None or '.' not in url:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    return trimmed if trimmed in KNOWN_EXTENSIONS else default_ext
1702
1703
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return filename with its extension replaced by "<sub_lang>.<sub_format>" (see replace_extension)"""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1706
1707
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    @returns            a naive datetime; "now"/"today"/"yesterday" are relative to the current UTC time
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    # Naive UTC "now"; equivalent to datetime.utcnow(), which is deprecated since Python 3.12
    utc_now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    today = datetime_round(utc_now, precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # The part before the offset is resolved recursively
        start_time = datetime_from_str(match.group('start'), precision, format)
        amount = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # Calendar-aware arithmetic; the result is rounded to days in auto mode
            new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                amount *= 7
            delta = datetime.timedelta(**{unit + 's': amount})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1748
1749
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    @raises ValueError  if strict is set and date_str does not match
    """
    if strict:
        allowed = re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str)
        if not allowed:
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1760
1761
def datetime_add_months(dt, months):
    """
    Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    year, month = dt.year + year_shift, month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year=year, month=month, day=min(dt.day, last_day))
1769
1770
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    @param precision    microsecond|second|minute|hour|day
                        "microsecond" returns dt unchanged
    @returns            a naive datetime rounded to the nearest multiple of the unit
    @raises KeyError    for an unknown precision
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    step = unit_seconds[precision]
    # timetuple() carries no microseconds, so they are dropped before rounding
    timestamp = calendar.timegm(dt.timetuple())
    rounded = ((timestamp + step / 2) // step) * step
    # Same result as datetime.utcfromtimestamp(), which is deprecated since Python 3.12
    return datetime.datetime.fromtimestamp(rounded, datetime.timezone.utc).replace(tzinfo=None)
1787
1788
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged.
    """
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if parts is None else '-'.join(parts.groups())
1797
1798
class DateRange:
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """
        start and end must be strings in the format accepted by date_from_str in strict mode;
        None leaves that end of the range unbounded

        @raises ValueError  if start lies after end
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start, strict=True)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range; non-date values are parsed with date_from_str"""
        if isinstance(date, datetime.date):
            candidate = date
        else:
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1832
1833
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime, platform, OpenSSL and libc (cached)"""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_ver = []

    details = (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
    return 'Python %s (%s %s %s) - %s (%s%s)' % details
1852
1853
@functools.cache
def get_windows_version():
    """ Get Windows version. returns () if it's not running on Windows """
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1861
1862
def write_string(s, out=None, encoding=None):
    """
    Write the string s to out (sys.stderr by default) and flush it

    For binary-mode streams and streams exposing a ".buffer", s is encoded
    (characters that cannot be encoded are dropped) and written as bytes;
    otherwise it is written as text.
    """
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(out, 'mode', ''):
        target = out
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
    else:
        target, enc = out, None

    payload = s.encode(enc, 'ignore') if enc else s
    target.write(payload)
    out.flush()
1882
1883
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """
    Report a deprecation

    When running as CLI, each distinct msg is reported at most once, through
    printer if given (else write_string), with kwargs forwarded.
    Otherwise a DeprecationWarning is issued via the warnings module.
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    seen = deprecation_warning._cache
    if msg in seen:
        return
    seen.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Messages already reported in CLI mode
deprecation_warning._cache = set()
1899
1900
def bytes_to_intlist(bs):
    """Return the items of bs as a list of ints; items that are not ints are converted with ord()"""
    if not bs:
        return []
    return list(bs) if isinstance(bs[0], int) else [ord(char) for char in bs]
1908
1909
def intlist_to_bytes(xs):
    """Pack a sequence of ints (each packed as an unsigned byte) into a bytes object"""
    return struct.pack(f'{len(xs):d}B', *xs) if xs else b''
1914
1915
class LockingUnsupportedError(OSError):
    """Raised by the fallback lock helpers used when the fcntl module cannot be imported"""
    msg = 'File locking is not supported'

    def __init__(self):
        OSError.__init__(self, self.msg)
1921
1922
1923 # Cross-platform file locking
1924 if sys.platform == 'win32':
1925     import ctypes
1926     import ctypes.wintypes
1927     import msvcrt
1928
1929     class OVERLAPPED(ctypes.Structure):
1930         _fields_ = [
1931             ('Internal', ctypes.wintypes.LPVOID),
1932             ('InternalHigh', ctypes.wintypes.LPVOID),
1933             ('Offset', ctypes.wintypes.DWORD),
1934             ('OffsetHigh', ctypes.wintypes.DWORD),
1935             ('hEvent', ctypes.wintypes.HANDLE),
1936         ]
1937
1938     kernel32 = ctypes.WinDLL('kernel32')
1939     LockFileEx = kernel32.LockFileEx
1940     LockFileEx.argtypes = [
1941         ctypes.wintypes.HANDLE,     # hFile
1942         ctypes.wintypes.DWORD,      # dwFlags
1943         ctypes.wintypes.DWORD,      # dwReserved
1944         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1945         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1946         ctypes.POINTER(OVERLAPPED)  # Overlapped
1947     ]
1948     LockFileEx.restype = ctypes.wintypes.BOOL
1949     UnlockFileEx = kernel32.UnlockFileEx
1950     UnlockFileEx.argtypes = [
1951         ctypes.wintypes.HANDLE,     # hFile
1952         ctypes.wintypes.DWORD,      # dwReserved
1953         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1954         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1955         ctypes.POINTER(OVERLAPPED)  # Overlapped
1956     ]
1957     UnlockFileEx.restype = ctypes.wintypes.BOOL
1958     whole_low = 0xffffffff
1959     whole_high = 0x7fffffff
1960
1961     def _lock_file(f, exclusive, block):
1962         overlapped = OVERLAPPED()
1963         overlapped.Offset = 0
1964         overlapped.OffsetHigh = 0
1965         overlapped.hEvent = 0
1966         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1967
1968         if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1969                           (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1970                           0, whole_low, whole_high, f._lock_file_overlapped_p):
1971             # NB: No argument form of "ctypes.FormatError" does not work on PyPy
1972             raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
1973
1974     def _unlock_file(f):
1975         assert f._lock_file_overlapped_p
1976         handle = msvcrt.get_osfhandle(f.fileno())
1977         if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
1978             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1979
1980 else:
1981     try:
1982         import fcntl
1983
1984         def _lock_file(f, exclusive, block):
1985             flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1986             if not block:
1987                 flags |= fcntl.LOCK_NB
1988             try:
1989                 fcntl.flock(f, flags)
1990             except BlockingIOError:
1991                 raise
1992             except OSError:  # AOSP does not have flock()
1993                 fcntl.lockf(f, flags)
1994
1995         def _unlock_file(f):
1996             with contextlib.suppress(OSError):
1997                 return fcntl.flock(f, fcntl.LOCK_UN)
1998             with contextlib.suppress(OSError):
1999                 return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
2000             return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking
2001
2002     except ImportError:
2003
2004         def _lock_file(f, exclusive, block):
2005             raise LockingUnsupportedError()
2006
2007         def _unlock_file(f):
2008             raise LockingUnsupportedError()
2009
2010
class locked_file:
    """
    Open a file and hold a lock on it while used as a context manager

    Only the modes r, rb, a, ab, w and wb are supported. Read modes take a
    shared lock, the others an exclusive one. For "w" modes the file is
    truncated only after the lock has been acquired.
    Unknown attributes are delegated to the underlying file object.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode = mode
        self.block = block

        can_write = not set(mode).isdisjoint('wax+')
        can_read = not set(mode).isdisjoint('r+')
        if not can_write:
            flags = os.O_RDONLY
        elif can_read:
            flags = os.O_RDWR
        else:
            flags = os.O_WRONLY

        flags |= getattr(os, 'O_CLOEXEC', 0)  # UNIX only
        flags |= getattr(os, 'O_BINARY', 0)  # Windows only
        flags |= getattr(os, 'O_NOINHERIT', 0)  # Windows only
        if can_write:
            flags |= os.O_CREAT  # O_TRUNC only after locking
        if 'a' in mode:
            flags |= os.O_APPEND
        if 'x' in mode:
            flags |= os.O_EXCL

        fd = os.open(filename, flags, 0o666)
        self.f = os.fdopen(fd, mode, encoding=encoding)

    def __enter__(self):
        try:
            _lock_file(self.f, 'r' not in self.mode, self.block)
            self.locked = True
        except OSError:
            # Do not leave the file open when the lock could not be taken
            self.f.close()
            raise

        if 'w' not in self.mode:
            return self
        try:
            self.f.truncate()
        except OSError as e:
            expected = (
                errno.ESPIPE,  # Illegal seek - expected for FIFO
                errno.EINVAL,  # Invalid argument - expected for /dev/null
            )
            if e.errno not in expected:
                raise
        return self

    def unlock(self):
        """Release the lock if it is held; the file stays open"""
        if self.locked:
            try:
                _unlock_file(self.f)
            finally:
                self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2074
2075
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None (cached)"""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2080
2081
def shell_quote(args):
    """Return args quoted with compat_shlex_quote and joined by spaces; bytes are decoded first"""
    encoding = get_filesystem_encoding()
    # We may get a filename encoded with 'encodeFilename'
    return ' '.join(
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
2091
2092
def smuggle_url(url, data):
    """
    Pass additional data in a URL for internal use.

    The data is JSON-encoded into the URL fragment. Data already smuggled in
    url is merged into (and takes precedence over) data, which is updated in place.
    """
    url, already_smuggled = unsmuggle_url(url, {})
    data.update(already_smuggled)
    payload = json.dumps(data)
    fragment = urllib.parse.urlencode({'__youtubedl_smuggle': payload})
    return url + '#' + fragment
2101
2102
def unsmuggle_url(smug_url, default=None):
    """
    Extract data embedded in a URL fragment under the "__youtubedl_smuggle" key

    @returns    (url, data); (smug_url, default) if nothing was smuggled
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
2110
2111
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """
    Formats numbers with decimal suffixes like K, M, etc

    @param fmt      %-format taking (scaled number, suffix)
    @param factor   ratio between consecutive suffixes; 1024 selects "Ki", "Mi", ...
    @returns        the formatted string, or None if num is not a number or is negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp at 0: a negative exponent (tiny fractions) would index the suffixes from the end
        exponent = min(max(int(math.log(num, factor)), 0), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2124
2125
def format_bytes(bytes):
    """Format a byte count with format_decimal_suffix (factor 1024, two decimals); 'N/A' if that yields nothing"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2128
2129
def lookup_unit_table(unit_table, s, strict=False):
    """
    Parse "<number><unit>" and return the number times the unit's multiplier, rounded

    @param unit_table   mapping of unit name to multiplier
    @param strict       require the whole string to match and only "." as decimal
                        separator; otherwise a prefix match suffices and "," is
                        also accepted as decimal separator
    @returns            the rounded value, or None if s does not match
    """
    if strict:
        matcher, num_re = re.fullmatch, NUMBER_RE
    else:
        matcher, num_re = re.match, NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(map(re.escape, unit_table))

    found = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not found:
        return None

    value = float(found.group('num').replace(',', '.'))
    return round(value * unit_table[found.group('unit')])
2141
2142
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    multipliers = {
        unit: 1024 ** power
        for power, unit in enumerate(('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'))}
    return lookup_unit_table(multipliers, s.upper(), strict=True)
2148
2149
def parse_filesize(s):
    """
    Parse a file size such as "5 MiB" or "1.2GB" into a number of bytes

    @returns    the size, or None if s is None or is not recognized
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        unit_table.update({
            f'{letter}iB': binary,
            f'{letter}B': decimal,
            f'{lower}B': binary,
            f'{letter}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
2219
2220
def parse_count(s):
    """
    Parse a count such as "1,234", "1.5k" or "2M" into an integer

    @returns    the count, or None if s is None or is not recognized
    """
    if s is None:
        return None

    # Drop a leading non-numeric run that is followed by whitespace
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        **dict.fromkeys(('k', 'K'), 1000),
        **dict.fromkeys(('m', 'M', 'kk', 'KK'), 1000 ** 2),
        **dict.fromkeys(('b', 'B'), 1000 ** 3),
    }
    scaled = lookup_unit_table(multipliers, s)
    if scaled is not None:
        return scaled

    # A number followed by an unknown suffix: use the number alone
    leading_number = re.match(r'([\d,.]+)(?:$|\s)', s)
    if leading_number:
        return str_to_int(leading_number.group(1))
    return None
2248
2249
def parse_resolution(s, *, lenient=False):
    """
    Extract width/height information from a string

    Tried in order: "<w>x<h>" (also "X", "×" or "," as separator),
    "<h>p" / "<h>i", and "4k" / "8k".

    @param lenient  allow the "<w>x<h>" form to be directly surrounded by alphanumerics
    @returns        dict with "width" and/or "height"; empty if nothing was found
    """
    if s is None:
        return {}

    dimensions_re = (
        r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
        else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    dimensions = re.search(dimensions_re, s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    k_label = re.search(r'\b([48])[kK]\b', s)
    if k_label:
        return {'height': int(k_label.group(1)) * 540}

    return {}
2273
2274
def parse_bitrate(s):
    """Return the integer preceding "kbps" in s; None if s is not a string or has no such number"""
    if isinstance(s, str):
        found = re.search(r'\b(\d+)\s*kbps', s)
        if found:
            return int(found.group(1))
    return None
2281
2282
def month_by_name(name, lang='en'):
    """
    Return the number (1-12) of a month by its full name

    Names are taken from MONTH_NAMES[lang], falling back to the 'en' list
    for unknown languages. Returns None if the name is not listed.
    """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in month_names:
        return None
    return month_names.index(name) + 1
2292
2293
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, i.e. the first three letters of the name; None if unknown """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev not in abbreviations:
        return None
    return abbreviations.index(abbrev) + 1
2302
2303
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a predefined or numeric entity by '&amp;' in XML"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
2310
2311
def setproctitle(title):
    """
    Set the process name through libc's prctl()

    Best effort: silently does nothing if ctypes or "libc.so.6" cannot be
    loaded or the library has no prctl.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except (OSError, TypeError):
        return

    # Initializing from bytes makes the buffer one byte longer than the title,
    # so the string handed to prctl() is always NUL-terminated
    buf = ctypes.create_string_buffer(title.encode())
    try:
        # 15 is PR_SET_NAME (Ref: /usr/include/linux/prctl.h)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2337
2338
def remove_start(s, start):
    """Return s without the prefix start; s unchanged (None included) if it does not start with it"""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
2341
2342
def remove_end(s, end):
    """Return s without the suffix end; s unchanged (None included) if it does not end with it"""
    # An empty suffix must be a no-op: s[:-0] would otherwise yield ''
    if s is not None and s.endswith(end) and end:
        return s[:-len(end)]
    return s
2345
2346
def remove_quotes(s):
    """Strip one pair of matching single or double quotes enclosing s, if present"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2354
2355
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"

    Returns the network location of url without a leading "www.", or None if empty
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
2362
2363
def url_basename(url):
    """Return the last component of the URL's path, ignoring leading/trailing slashes"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
2367
2368
def base_url(url):
    """
    Return the http(s) URL up to and including the last "/" before any query or fragment

    Raises AttributeError if url does not match.
    """
    prefix = re.match(r'https?://[^?#]+/', url)
    return prefix.group()
2371
2372
def urljoin(base, path):
    """
    Join a base URL and a possibly relative path

    bytes arguments are decoded first. A path that already has a scheme or
    starts with "//" is returned as is.

    @returns    the joined URL, or None if path is empty or not a string, or
                if base is not an http(s) or protocol-relative URL
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not (isinstance(path, str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode()
    if isinstance(base, str) and re.match(
            r'^(?:https?:)?//', base):
        return urllib.parse.urljoin(base, path)
    return None
2386
2387
class HEADRequest(urllib.request.Request):
    """urllib request whose HTTP method is always reported as HEAD"""

    def get_method(self):
        return 'HEAD'
2391
2392
class PUTRequest(urllib.request.Request):
    """urllib request whose HTTP method is always reported as PUT"""

    def get_method(self):
        return 'PUT'
2396
2397
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """
    Convert v to int, returning default if that is not possible

    @param get_attr     if set (and v is not None), convert this attribute of v instead
    @param scale        the result is floor-divided by this
    @param invscale     the result is multiplied by this (before dividing)
    """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    try:
        scaled = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
    return scaled
2405
2406
def str_or_none(v, default=None):
    """Return str(v), or default if v is None"""
    if v is None:
        return default
    return str(v)
2409
2410
def str_to_int(int_str):
    """
    A more relaxed version of int_or_none

    ints are returned as is; in strings, ",", "." and "+" are removed before
    conversion. Any other type yields None.
    """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        digits = re.sub(r'[,\.\+]', '', int_str)
        return int_or_none(digits)
    return None
2418
2419
def float_or_none(v, scale=1, invscale=1, default=None):
    """
    Convert v to float, returning default if that is not possible

    The result is multiplied by invscale and divided by scale.
    """
    try:
        # float(None) raises TypeError, so None falls through to default as well
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
2427
2428
def bool_or_none(v, default=None):
    """Return v if it is a bool, else default"""
    if isinstance(v, bool):
        return v
    return default
2431
2432
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace if it is a string, else default"""
    if isinstance(v, str):
        return v.strip()
    return default
2435
2436
def url_or_none(url):
    """
    Return the stripped url if it looks like a URL with a supported scheme
    (http(s), rtmp/rtsp variants, mms, ftp(s)) or is protocol-relative; None otherwise
    """
    if not url or not isinstance(url, str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2442
2443
def request_to_url(req):
    """Return the full URL of a urllib.request.Request; anything else is returned unchanged"""
    return req.get_full_url() if isinstance(req, urllib.request.Request) else req
2449
2450
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """
    Format a timestamp with strftime, returning default on failure

    @param timestamp    UNIX timestamp (int/float, interpreted as UTC) or a
                        "YYYYMMDD" string
    @param date_format  strftime format; "%s" is replaced by the integer UNIX timestamp
    @returns            the formatted date, or default if timestamp is of another
                        type, cannot be parsed or is out of range
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                               + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # For any other type datetime_object stays None; the AttributeError below yields default
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError):
        # OverflowError: timestamp is outside the range timedelta/datetime can represent
        return default
2468
2469
def parse_duration(s):
    """
    Parse a duration string into a number of seconds

    Three notations are tried in order: clock style ("[[dd:]hh:]mm:ss[.ms]"),
    unit style ("3h 11m 53s", "PT1H30M", ...) and a single fractional amount
    of hours or minutes ("2.5 hours", "87 Min.").
    In the unit style, years, months and weeks are accepted but not counted.

    @returns    the duration in seconds, or None if s is not a string, is
                blank or is in none of these notations
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None
    clock = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if clock:
        days, hours, mins, secs, ms = clock.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        units = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if units:
            days, hours, mins, secs, ms = units.groups()
        else:
            single = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not single:
                return None
            hours, mins = single.groups()

    if ms:
        # The clock notation also allows ":" before the fraction
        ms = ms.replace(':', '.')
    weighted_parts = ((days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1))
    return sum(float(part or 0) * mult for part, mult in weighted_parts)
2524
2525
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the real extension of `filename`

    If `expected_real_ext` is given and the current extension differs from it,
    `ext` is appended to the whole filename instead.
    """
    stem, current_ext = os.path.splitext(filename)
    if expected_real_ext and current_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{stem}.{ext}{current_ext}'
2532
2533
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the extension of `filename` for `ext`

    If `expected_real_ext` is given and the current extension differs from it,
    nothing is removed and `ext` is appended to the whole filename instead.
    """
    stem, current_ext = os.path.splitext(filename)
    if expected_real_ext and current_ext[1:] != expected_real_ext:
        stem = filename
    return f'{stem}.{ext}'
2539
2540
def check_executable(exe, args=[]):
    """ Try to launch `exe` and return its name if that works, else False.
    args can be a list of arguments that make the program exit quickly (like -version) """
    try:
        command = [exe] + args
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        # The binary could not be started at all
        return False
    else:
        return exe
2549
2550
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout/stderr as text

    Returns False if the program could not be started (OSError)
    and None if it exited with a non-zero status.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        output, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else output
2563
2564
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output

    @param output        text printed by the program (must be a str)
    @param version_re    regex whose first group captures the version;
                         defaults to matching "version <something>"
    @param unrecognized  value returned when the regex does not match
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
2574
2575
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    `unrecognized` holds one or two fallback values: the first is used when
    the output contains no recognisable version, the last when the executable
    exits with a non-zero status """
    fallbacks = variadic(unrecognized)
    assert len(fallbacks) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return fallbacks[-1]
    if not output:
        # False (could not be started) or empty output are passed through untouched
        return output
    return detect_exe_version(output, version_re, fallbacks[0])
2586
2587
def frange(start=0, stop=None, step=1):
    """Float range

    Works like range(), but also accepts non-integer (and infinite) bounds and steps.
    A zero step yields nothing."""
    if stop is None:
        stop, start = start, 0
    if not step:
        direction = 0
    else:
        direction = 1 if step > 0 else -1
    current = start
    # Multiplying both sides by the direction turns a descending range into an ascending comparison
    while direction * current < direction * stop:
        yield current
        current += step
2596
2597
class LazyList(collections.abc.Sequence):
    """Immutable sequence that pulls items from an iterable only when needed

    Every item that has been fetched is cached, so the iterable is consumed at most once.
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = _cache if _cache is not None else []
        self._reversed = reverse

    def __iter__(self):
        if not self._reversed:
            yield from self._cache
            for item in self._iterable:
                self._cache.append(item)
                yield item
        else:
            # Iterating in reverse is only possible once everything has been fetched
            yield from self.exhaust()

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        step = -1 if self._reversed else 1
        return self._exhaust()[::step]

    @staticmethod
    def _reverse_index(x):
        # ~x maps 0 -> -1, 1 -> -2, ... i.e. the same position counted from the other end
        if x is None:
            return None
        return ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        counts_from_end = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if counts_from_end:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            # Fetch only as many additional items as the request can reach
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))

        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Probing the first item avoids evaluating the whole iterable
        probe = -1 if self._reversed else 0
        try:
            self[probe]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        return len(self._exhaust())

    def __reversed__(self):
        # The new object shares both the iterator and the cache with this one
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        items = self.exhaust()
        return repr(items)

    def __str__(self):
        items = self.exhaust()
        return repr(items)
2685
2686
class PagedList:
    """Base class for lists whose entries are obtained one page at a time

    `pagefunc(pagenum)` must return an iterable with the entries of that page.
    Subclasses implement `_getslice`."""

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the entries of page `pagenum` as a list, using the cache if possible"""
        results = self._cache.get(pagenum)
        if results is None:
            if pagenum > self._pagecount:
                # Pages beyond the known page count are empty; don't call pagefunc for them
                results = []
            else:
                results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list"""
        entries = self._getslice(start, end)
        return list(entries)

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not (isinstance(idx, int) and idx >= 0):
            raise TypeError('indices must be non-negative integers')
        found = self.getslice(idx, idx + 1)
        if found:
            return found[0]
        raise self.IndexError()
2725
2726
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        """Yield the entries with 0-based indices in [start, end)

        Pages are requested one at a time, starting with the page containing
        `start`, until `end` is reached or a page comes back shorter than the
        page size (which is taken to be the last page).
        `end=None` means "up to the last page".
        """
        # An empty or inverted range selects nothing. Without this guard, an
        # `end` that is <= `start` and falls on a page boundary makes the
        # arithmetic below yield entries beyond `end`
        if end is not None and end <= start:
            return

        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested range within the current page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last page that could be fetched, so that
                # getpage() returns an empty list for any later page
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2766
2767
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        super().__init__(pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        """Yield the entries with 0-based indices in [start, end), never going past the last page"""
        first_page, offset = divmod(start, self._pagesize)
        if end is None:
            stop_page, remaining = self._pagecount, None
        else:
            stop_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start

        for pagenum in range(first_page, stop_page):
            entries = self.getpage(pagenum)
            if offset:
                # Only the first page needs its leading entries dropped
                entries = entries[offset:]
                offset = None
            if remaining is not None:
                if len(entries) >= remaining:
                    yield from entries[:remaining]
                    break
                remaining -= len(entries)
            yield from entries
2792
2793
class PlaylistEntries:
    """Index-based, lazy access to the "entries" of a playlist info dict

    Indexing/slicing uses 1-based playlist positions (see __getitem__)
    and yields (playlist_index, entry) pairs.
    """
    # Placeholder stored at positions of an incomplete playlist for which no entry is available
    MissingEntry = object()
    # Set to True when the entries are a plain list, or once an index past the end has been requested
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        object providing `params`, `report_warning`, `_match_entry`
                          and (on its type) `_handle_extraction_exceptions`
        @param info_dict  playlist info dict; its "entries" must not be None
        @raises           EntryNotInPlaylist if there are no entries
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # requested_entries holds the 1-based position of each given entry;
            # every other position up to the largest one is marked as missing
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            # Any other iterable is wrapped so that it can be indexed lazily
            self._entries = LazyList(entries)

    # One comma-separated segment of a playlist-items specification:
    # either a single index, or START[:-]END with an optional :STEP
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int (single index) or a slice (range) for each comma-separated segment

        @raises ValueError  for empty segments, segments that do not match
                            PLAYLIST_ITEMS_RE, or a step of zero
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # The end is parsed as a float, presumably so that "inf" can be represented
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (playlist_index, entry) for the items selected by the
        "playlist_items", "playliststart" and "playlistend" params"""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    # Stop producing items altogether
                    return

    def get_full_count(self):
        """Return the total number of entries when it is known, else None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page, the page count equals the entry count
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function that returns the entry at a 0-based index

        It raises self.IndexError when the index is past the end and
        EntryNotInPlaylist when the position is marked as missing.
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # The lookup is routed through ydl's _handle_extraction_exceptions wrapper
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Yield (playlist_index, entry) for a 1-based index or slice

        Unlike a normal slice, the stop position is included.
        Negative positions count from the end, which requires evaluating all entries.
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move the stop one step further in the direction of travel to make it inclusive
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forwards, nothing can follow; going backwards, smaller indices may still exist
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Evaluates every entry
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2928
2929
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape sequence (8 hex digits) in `s` with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        text, _ = decode(match.group(0))
        return text

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2936
2937
def lowercase_escape(s):
    r"""Replace every \uXXXX escape sequence (4 hex digits) in `s` with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        text, _ = decode(match.group(0))
        return text

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2944
2945
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # These characters (including "%", so existing escapes survive) are left untouched
    unescaped = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, unescaped)
2949
2950
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)
    # The host is IDNA-encoded; every other textual component is percent-escaped
    ascii_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = {
        component: escape_rfc3986(getattr(parts, component))
        for component in ('path', 'params', 'query', 'fragment')
    }
    return parts._replace(netloc=ascii_netloc, **escaped).geturl()
2961
2962
def parse_qs(url, **kwargs):
    """Parse the query string of `url` into a dict of value lists

    Keyword arguments are passed on to urllib.parse.parse_qs."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2965
2966
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file

    Each line holds one URL. Blank lines and lines starting with "#", ";" or "]"
    are skipped, a leading BOM and surrounding whitespace are removed, and
    anything following " #" is treated as a comment.

    @param batch_fd  iterable of lines (str or bytes); it is closed after reading
    @returns         list of the cleaned URLs, in file order
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # The BOM is matched both as the three characters \xef\xbb\xbf
        # and as the single character U+FEFF
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit is passed by keyword since passing it positionally
        # is deprecated as of Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2984
2985
def urlencode_postdata(*args, **kargs):
    """URL-encode form data (see urllib.parse.urlencode) and return it as ASCII bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2988
2989
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  mapping merged into the existing query parameters
       @returns             str
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            # Nothing to change; hand the string back untouched
            return url
        url = urllib.parse.urlparse(url)

    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        # Keys in query_update override the ones already present in the URL
        merged_query = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged_query, True)

    updated = url._replace(**kwargs)
    return urllib.parse.urlunparse(updated)
3008
3009
def update_url_query(url, query):
    """Return `url` with the parameters in `query` merged into its query string"""
    return update_url(url=url, query_update=query)
3012
3013
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Build a new request from `req`, overriding the given parts

    @param req      the request to copy from
    @param url      replacement URL (default: the URL of `req`)
    @param data     replacement body (default: the body of `req`)
    @param headers  headers added on top of those of `req`
    @param query    query parameters merged into the URL
    @returns        a new request; HEAD and PUT requests keep their method
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    body = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    method = req.get_method()
    request_cls = (
        HEADRequest if method == 'HEAD'
        else PUTRequest if method == 'PUT'
        else urllib.request.Request)

    clone = request_cls(
        target_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
3032
3033
def _multipart_encode_impl(data, boundary):
    '''
    Encode `data` as multipart/form-data using the given boundary

    Returns (body bytes, Content-Type header value).
    Raises ValueError if the boundary occurs inside any of the encoded parts.
    '''
    content_type = 'multipart/form-data; boundary=%s' % boundary
    delimiter = boundary.encode('ascii')

    chunks = []
    for name, value in data.items():
        chunks.append(b'--' + delimiter + b'\r\n')
        if isinstance(name, str):
            name = name.encode()
        if isinstance(value, str):
            value = value.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        part = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if delimiter in part:
            raise ValueError('Boundary overlaps with data')
        chunks.append(part)

    chunks.append(b'--' + delimiter + b'--\r\n')

    return b''.join(chunks), content_type


def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If given as a Unicode object, it is used as the boundary and a
        ValueError is raised when it clashes with the data. Otherwise
        random boundaries are tried until one does not clash.

    Returns a tuple of the encoded body (bytes) and the Content-Type value.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary occurs in the data; try another one
            continue
3084
3085
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Tell whether `x` should be treated as a collection of items

    @param allowed_types  type(s) that `x` must be an instance of
    @param blocked_types  type(s) that `x` must not be an instance of;
                          by default str, bytes and mappings
    """
    blocked = (str, bytes, collections.abc.Mapping) if blocked_types is NO_DEFAULT else blocked_types
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked)
3090
3091
def variadic(x, allowed_types=NO_DEFAULT):
    """Return `x` unchanged if it is iterable-like, else wrap it in a 1-tuple

    @param allowed_types  type(s) that are never treated as iterables and hence
                          get wrapped; passed to is_iterable_like as blocked_types
    """
    if isinstance(allowed_types, (tuple, type)):
        blocked = allowed_types
    else:
        deprecation_warning('allowed_types should be a tuple or a type')
        blocked = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=blocked):
        return x
    return (x, )
3097
3098
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn and return the first acceptable result

    A function is skipped when it raises one of the common lookup/conversion
    errors, or when `expected_type` is given and its result is not of that type.

    @param funcs          callables, all invoked with the same `args` and `kwargs`
    @param expected_type  type (or tuple of types) the result must be an instance of
    @returns              the first acceptable result, or None if there is none
    """
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
3108
3109
def try_get(src, getter, expected_type=None):
    """Apply the getter(s) to `src` and return the first acceptable result (see try_call)

    `getter` may be a single callable or a collection of them."""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
3112
3113
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of `dct` for which `cndn(key, value)` is true

    By default, items whose value is None are dropped."""
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
3116
3117
def merge_dicts(*dicts):
    """Merge dicts, with earlier dicts taking precedence

    None values are never copied, and an empty string that was already
    merged is replaced by the first string value that follows for that key."""
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
3126
3127
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as a str, decoding it with `encoding` if it is not one already"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
3130
3131
# Age limit that parse_age_limit returns for each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limit that parse_age_limit returns for each TV Parental Guidelines label
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """Convert an age rating into a numeric age limit

    @param s  an int between 0 and 21, a string such as "18" or "18+",
              a key of US_RATINGS (case-insensitive), or a TV rating
              written as "TV-14", "TV_14" or "TV14" (case-insensitive)
    @returns  the age limit as an int, or None if `s` is not recognised
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    # Match on the part after "TV-" so that the separator may vary or be left out
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_rating is None:
        return None
    return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
3167
3168
def strip_jsonp(code):
    """Unwrap a JSONP response, leaving only the text passed to the callback

    Input that does not look like a (possibly "window."-prefixed) function
    call is returned unchanged."""
    JSONP_RE = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.compile(JSONP_RE).sub(r'\g<callback_data>', code)
3177
3178
def js_to_json(code, vars={}, *, strict=False):
    """Convert JavaScript literal code into JSON text

    The code is scanned with one regular expression that finds strings,
    comments, trailing commas, identifiers, hex/octal integers, numeric
    object keys and "!" runs; every match is rewritten by fix_kv.

    @param code    the JavaScript source
    @param vars    dict of identifier -> replacement text. In non-strict mode,
                   a replacement that is not valid JSON is emitted as a JSON string
    @param strict  if True, raise ValueError for identifiers that cannot be
                   resolved instead of turning them into strings, and skip the
                   lenient pre-processing of "new X(...)", parseInt and IIFE calls
    @returns       the converted text
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # One alternative per quote character: a quoted run that may contain escaped characters
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Optional whitespace and at most one comment
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) pairs for hexadecimal and octal integers, optionally used as an object key
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # group(1) is a bare double quote, group(2) is the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Escapes that JSON knows are kept; "\x" becomes the start of a "\u00XX"
        # escape; an escaped newline is dropped; any other escaped character is kept bare
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Evaluate the expression inside ${...} of a template string
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            # A JSON string is inserted by its value, without the surrounding quotes
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Rewrite a single token matched by the main regex at the end of js_to_json
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, "!" runs and trailing commas are removed
            return ''

        if v[0] in STRING_QUOTES:
            # Template strings (backticks) get their ${...} placeholders substituted first
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # An integer used as an object key must become a quoted key
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            # Unknown identifiers (including unquoted object keys) are turned into strings
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # Turn the list of [key, value] pairs given to "new Map(...)" into a JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # "new Date("...")" is replaced by its string argument
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        # Any other "new X(...)" expression is kept as a string holding its source text
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        # "parseInt(...)" is replaced by the run of digits found in its argument
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        # An immediately-invoked function called with a quoted string is replaced by that argument
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
3257
3258
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its position in `quality_ids`,
    or to -1 if it is not listed """
    def rank(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return rank
3267
3268
# Names of the stages that a postprocessor can be attached to
# NOTE(review): meaning and ordering inferred from the names - confirm against the users of this tuple
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# The known output template types
# NOTE(review): the values look like the default file suffix of each type
# (None where there is none) - confirm against the users of this dict
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# This is a template, not a finished regex: {0} must be filled with a pattern for
# the mapping key and {1} with a pattern for the conversion type before compiling.
# A "%" preceded by an odd number of "%" is escaped and does not match; the
# preceding "%%" pairs are captured as "prefix"
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of printf-style ("%") string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
3308
3309
def limit_length(s, length):
    """ Add ellipses to overly long strings

    @param s       the string to shorten; None is passed through
    @param length  maximum length of the result, ellipses included
    @returns       `s` itself if it fits, else its beginning followed by "...".
                   The result is never longer than `length`; if `length` leaves
                   no room for text, the ellipses themselves are cut to fit
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length <= len(ELLIPSES):
        # A negative slice end would count from the end of `s` and produce
        # a result that is longer than `length`
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
3318
3319
def version_tuple(v):
    """Split a version string on "." and "-" into a tuple of ints

    Raises ValueError if any component is not an integer."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
3322
3323
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether `version` is older than `limit`

    When `version` is empty or either version cannot be parsed, the answer
    is "not outdated" if `assume_new` is true and "outdated" otherwise."""
    fallback = not assume_new
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            pass
    return fallback
3331
3332
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """
    # Imported when called rather than together with the module's other imports
    from ..update import is_non_updateable

    blocked = is_non_updateable()
    return not blocked
3339
3340
def args_to_str(args):
    """ Get a short string representation for a subprocess command

    Every argument is passed through compat_shlex_quote and the results are joined with spaces
    """
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
3344
3345
3346 def error_to_str(err):
3347     return f'{type(err).__name__}: {err}'
3348
3349
def mimetype2ext(mt, default=NO_DEFAULT):
    """ Guess a file extension from a MIME type

    @param mt       The MIME type. Anything after the first ";" is ignored
    @param default  Returned instead of a guess when the type is not in the table
    @returns        The extension from the table. If there is none: default when it
                    was given, otherwise None for a non-string mt, otherwise the
                    subtype with every "+" replaced by "."
    """
    has_default = default is not NO_DEFAULT
    if not isinstance(mt, str):
        return default if has_default else None

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop the parameters ("; charset=...") and normalize the rest
    full_type = mt.partition(';')[0].strip().lower()
    subtype = full_type.rpartition('/')[2]
    structured_suffix = subtype.rsplit('+')[-1]

    # The table is offered three keys: the whole "type/subtype", the subtype alone
    # and finally whatever follows the last "+" of the subtype
    ext = traversal.traverse_obj(MAP, full_type, subtype, structured_suffix)
    if ext:
        return ext
    if has_default:
        return default
    return subtype.replace('+', '.')
3438
3439
def ext2mimetype(ext_or_url):
    """ Guess the MIME type for a bare extension, a filename or a URL

    Returns None for empty input or when mimetypes.guess_type has no answer
    """
    if not ext_or_url:
        return None
    # A value without any "." is taken to be a bare extension; give it a dummy filename
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _encoding = mimetypes.guess_type(filename)
    return mime_type
3446
3447
def parse_codecs(codecs_str):
    """ Split a comma separated codecs string into video, audio and subtitle codecs

    @param codecs_str  The value of a "codecs" parameter, see http://tools.ietf.org/html/rfc6381
    @returns           {} for empty input. Otherwise a dict with "vcodec", "acodec"
                       ("none" when absent), "dynamic_range" and, only when a subtitle
                       codec was found, "scodec". When nothing is recognized but there are
                       exactly two entries, they are returned as vcodec and acodec in that order
    """
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros directly in front of a digit are dropped before matching, so that e.g. "vp09" is seen as "vp9"
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                # Only the first video codec counts
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if not (vcodec or acodec or scodec):
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        return {}

    parsed = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if scodec is not None:
        parsed['scodec'] = scodec
    return parsed
3488
3489
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """ Choose a container extension for the given video and audio streams

    @param vcodecs, vexts  Codecs and extensions of the video streams (same length)
    @param acodecs, aexts  Codecs and extensions of the audio streams (same length)
    @param preferences     Extensions to consider, in order. mkv is only allowed
                           when this is empty or contains "mkv"
    @returns               The chosen extension
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and (len(acodecs) > 1 or len(vcodecs) > 1):
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def first_codec_family(codecs):
        # The first codec, cut at the first "." and with every "0" removed
        return try_get(codecs, getter=lambda x: x[0].split('.')[0].replace('0', ''))

    vcodec, acodec = first_codec_family(vcodecs), first_codec_family(acodecs)

    # First choice: a container that is known to support both codecs
    for ext in preferences or COMPATIBLE_CODECS:
        if ext == 'mkv':
            return ext
        supported = COMPATIBLE_CODECS.get(ext, set())
        if vcodec in supported and acodec in supported:
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    # Second choice: an extension from the same family as all of the stream extensions
    for ext in preferences or vexts:
        if ext == 'mkv':
            return ext
        current_exts = {ext, *vexts, *aexts}
        if current_exts == {ext}:
            return ext
        if any(family.issuperset(current_exts) for family in COMPATIBLE_EXTS):
            return ext

    if allow_mkv:
        return 'mkv'
    return preferences[-1]
3528
3529
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """ Work out a file extension from the headers of a response

    Tried in order: the filename of a quoted "attachment" Content-Disposition,
    the text after the last "." of the x-amz-meta-name header, and finally
    mimetype2ext on the Content-Type (which is also what default is passed to)
    """
    header = url_handle.headers.get

    disposition = header('Content-Disposition')
    if disposition:
        attachment = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    amz_name = header('x-amz-meta-name')
    if amz_name:
        _, _, ext = amz_name.rpartition('.')
        if ext:
            return ext

    return mimetype2ext(header('Content-Type'), default=default)
3548
3549
def encode_data_uri(data, mime_type):
    """ Build a base64 "data:" URI holding the bytes in data with the given MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type!s};base64,{payload!s}'
3552
3553
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked when no age limit is set (age_limit is None) or when
    the content is available for everyone (content_limit is None)
    """
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3562
3563
# List of known byte-order-marks (BOM)
# Each entry is (the raw BOM bytes, the name of the encoding it announces).
# The order matters to is_html, which tries the entries from top to bottom with
# startswith: the UTF-32-LE mark begins with the same two bytes as the UTF-16-LE
# mark and therefore has to be listed before it
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3572
3573
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Leading byte-order-marks from BOMS are removed and select the encoding used
    for decoding (UTF-8 without one). The result is the match object for
    optional whitespace followed by "<", or None if the data starts differently
    """
    detected_encoding = 'utf-8'
    for marker, marker_encoding in BOMS:
        while first_bytes.startswith(marker):
            first_bytes = first_bytes[len(marker):]
            detected_encoding = marker_encoding

    text = first_bytes.decode(detected_encoding, 'replace')
    return re.match(r'^\s*<', text)
3583
3584
def determine_protocol(info_dict):
    """ Determine the download protocol of a format

    An explicit "protocol" entry always wins. Otherwise the protocol is derived
    from the sanitized "url": first from an rtmp/mms/rtsp prefix, then from an
    m3u8 or f4m extension, and finally from the URL scheme
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for streaming_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(streaming_prefix):
            return streaming_prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        if info_dict.get('is_live'):
            return 'm3u8'
        return 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3605
3606
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param header_row  The column titles
    @param data        The rows below the header
    @param delim       If set, the string that is repeated to draw a line under the header
    @param extra_gap   Number of additional spaces between the columns
    @param hide_empty  Leave out the columns in which every value of data is empty
    @returns           The table as a single string, one line per row
    """
    def visible_len(text):
        # Terminal sequences and the alignment tab take up no room on screen
        return len(remove_terminal_sequences(text).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_len(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, mask):
        # A width of 0 in mask drops that column; columns beyond the end of mask are kept
        return [cell for keep, cell in itertools.zip_longest(mask, row, fillvalue=True) if keep]

    mask = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, mask)
    data = [keep_columns(row, mask) for row in data]

    widths = column_widths([header_row, *data])
    gap = extra_gap + 1

    rows = [header_row]
    if delim:
        rule = [delim * (col_width + gap) for col_width in widths]
        rule[-1] = rule[-1][:-gap * len(delim)]  # Remove extra_gap from end of delimiter
        rows.append(rule)
    rows.extend(data)

    def pad(text, col_width):
        fill = col_width - visible_len(text)
        if '\t' in text:
            # The padding takes the place of the tab, pushing the text after it to the right
            return text.replace('\t', ' ' * fill) + ' ' * gap
        return text + ' ' * (fill + gap)

    lines = []
    for row in rows:
        cells = [pad(str(cell), widths[pos]) for pos, cell in enumerate(row)]
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3637
3638
def _match_one(filter_part, dct, incomplete):
    """ Evaluate a single condition of a filter string against dct

    @param filter_part  Either "KEY OP VALUE", where OP may be negated with a leading
                        "!" and may be followed by "?" to also accept a missing field,
                        or just "KEY" / "!KEY"
    @param dct          The dictionary whose fields are tested
    @param incomplete   True/False, or a container with the keys that may be missing from dct
    @returns            A truthy value if the condition passes
    @raises ValueError  if filter_part cannot be parsed, or if a string operator
                        is used to compare a numeric field with a numeric value
    """
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    # The keys are joined into a regex alternation in this order,
    # which is why a longer operator has to come before its own prefix
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    if isinstance(incomplete, bool):
        is_incomplete = lambda _: incomplete
    else:
        is_incomplete = lambda k: k in incomplete

    operator_rex = re.compile(r'''(?x)
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        # Exactly one of the two value groups took part in the match and both of them
        # need at least one character, so one of these is always a non-empty string
        comparison_value = m['quotedstrval'] or m['strval']
        if m['quote']:
            # Unescape the quote character inside of a quoted value
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, (int, float)):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                # Not a plain integer: try it as a file size (with and
                # without a trailing "B") and then as a duration
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            # A missing field passes if it is expected to be missing or if the operator ends in "?"
            return is_incomplete(m['key']) or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if is_incomplete(m.group('key')) and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
3717
3718
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.

    The string is a list of conditions separated by "&" which all have to pass.
    A literal "&" inside of a condition is written as "\\&"

    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3729
3730
def match_filter_func(filters, breaking_filters=None):
    """ Build the callable that decides whether a video passes the given filters

    @param filters           One or more filter strings for match_str; a video passes if
                             any one of them matches. The special entry "-" is not a
                             filter but switches on the interactive result (see below)
    @param breaking_filters  Filters of the same form. A video rejected by them
                             raises RejectedVideoReached instead of being skipped
    @returns                 None if no filters were given at all, else a function
                             (info_dict, incomplete=False) which returns None if the video
                             passes (NO_DEFAULT instead when "-" was given and incomplete
                             is false) or the message explaining why it is skipped
    """
    if not (filters or breaking_filters):
        return None
    check_breaking = match_filter_func(breaking_filters) or (lambda _, __: None)
    wanted = set(variadic(filters or []))

    interactive = '-' in wanted
    wanted.discard('-')

    def _match_func(info_dict, incomplete=False):
        rejection = check_breaking(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        passes = not wanted or any(match_str(f, info_dict, incomplete) for f in wanted)
        if not passes:
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, wanted))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
        if interactive and not incomplete:
            return NO_DEFAULT
        return None
    return _match_func
3753
3754
class download_range_func:
    """ Callable that yields the sections of a video which are to be downloaded

    @param chapters   Regexes that are searched for in the chapter titles
    @param ranges     (start, end) pairs. Negative values are counted from the
                      end of the video when its duration is known
    @param from_info  Whether to also use start_time/end_time of the info dict
    """

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters = chapters
        self.ranges = ranges
        self.from_info = from_info

    def __call__(self, info_dict, ydl):
        # Without any selection, an empty section is yielded
        if not (self.ranges or self.chapters or self.from_info):
            yield {}

        no_match_note = ('There are no chapters matching the regex' if info_dict.get('chapters')
                         else 'Cannot match chapters since chapter information is unavailable')
        matched = False
        for pattern in self.chapters or []:
            for idx, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(pattern, chapter['title']):
                    continue
                matched = True
                yield {**chapter, 'index': idx}
        if self.chapters and not matched:
            ydl.to_screen(f'[info] {info_dict["id"]}: {no_match_note}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time'),
                'end_time': info_dict.get('end_time'),
            }

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # A negative time is an offset from the end, clamped at the start of the video.
        # It is returned unchanged if the duration is missing or zero
        if info.get('duration') and time < 0:
            return max(info['duration'] + time, 0)
        return time

    def __eq__(self, other):
        # Only chapters and ranges are compared
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3795
3796
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP/TTML time expression into seconds

    Understands a plain number with an optional "s" suffix as well as
    "H:MM:SS" with optional fractional part separated by "." or ":".
    Returns None for empty input and for anything else
    """
    if not time_expr:
        return

    offset = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if not clock:
        return None
    hours, minutes, seconds = clock.groups()
    # The fraction may be separated by ":" instead of "."
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3808
3809
def srt_subtitles_timecode(seconds):
    """ Format a time given in seconds as an SRT timecode (HH:MM:SS,mmm) """
    timestamp = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timestamp
3812
3813
def ass_subtitles_timecode(seconds):
    """ Format a time given in seconds as an ASS timecode (H:MM:SS.cc) """
    timestamp = timetuple_from_msec(seconds * 1000)
    # The last field has only two digits, so the milliseconds are scaled down to hundredths
    hundredths = timestamp.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timestamp[:-1], hundredths)
3817
3818
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT

    The styling attributes listed in SUPPORTED_STYLING are rendered as
    <b>, <i>, <u> and <font> tags; paragraphs without a usable begin time,
    or with neither an end time nor a duration, are left out.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document does not contain any <p> element
    '''
    # (current namespace, [older namespaces that are rewritten to it before parsing])
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # Style id -> {property: value}, with the properties of the parent styles merged in
    styles = {}
    # Styling of <body>/<div>, which applies to every element
    default_style = {}

    class TTMLPElementParser:
        _out = ''
        # NOTE: These two lists are class attributes and are only ever changed in place,
        # so all the parsers created during one call of dfxp2srt share them.
        # start() pushes to _applied_styles for every styled element, while end() only pops
        # when that element opened a tag; the remaining entries stay visible to the paragraphs
        # parsed afterwards. The output depends on this, so it is deliberately left as it is
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                # Effective style of this element: defaults < referenced style < inline attributes
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Nothing to emit for a property that is already in effect
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened for this element, innermost first
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize the paragraph again and feed it through the styling-aware parser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Collect the <style> definitions. A style can refer to a parent that is only resolved
    # later on, so passes over the document are repeated until every style is resolved
    while True:
        repeat = False
        resolved_count = len(styles)
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # Whether a style can be resolved depends only on which ids are already known.
        # A pass that did not add any id would therefore be repeated forever (the parent
        # is undefined, has no supported styling of its own or is part of a reference
        # cycle), so give up on those styles instead of hanging
        if not repeat or len(styles) == resolved_count:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # The SRT counter follows the position in the document, so skipped paragraphs leave gaps
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3985
3986
def cli_option(params, command_option, param, separator=None):
    """ Build the command line arguments for an option that takes a value

    @param params          Mapping that the value is looked up in
    @param command_option  The option as it is written on the command line
    @param param           The key of the value in params
    @param separator       If given, option and value are joined with it into one
                           argument instead of being passed as two
    @returns               A list of arguments, empty if the value is None or missing
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3992
3993
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Build the command line arguments for an option that takes a boolean as its value

    The value of param has to be True, False or None. It is translated to
    true_value/false_value and then handled by cli_option, so None gives no arguments
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered = {True: true_value, False: false_value}
    return cli_option(rendered, command_option, flag, separator)
3998
3999
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] if the value of param equals expected_value, else [] """
    if params.get(param) == expected_value:
        return [command_option]
    return []
4002
4003
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """ Pick the extra command line arguments configured for a program

    @param argdict     Mapping of (lowercase) key to a list of arguments. A plain list or
                       tuple is still accepted: it is returned as it is if use_compat
                       is set and ignored otherwise
    @param keys        Keys to try, in order. An entry can also be a group of keys,
                       in which case the arguments of all of its keys are combined
    @param default     Returned if there is nothing for any of the keys
    @param use_compat  Whether a list/tuple argdict is honoured
    @returns           The arguments of the first entry of keys that has any
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        return default
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_group in keys:
        configured = [
            args for args in (argdict.get(key.lower()) for key in variadic(key_group))
            if args is not None]
        if configured:
            return list(itertools.chain.from_iterable(configured))
    return default
4022
4023
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """ Look up the arguments configured for exe when it is used by main_key

    The keys passed on to cli_configuration_args are "<main_key>+<exe><suffix>"
    (just "<exe><suffix>" if both names are the same) for every suffix in keys.
    Only if the key without a suffix is among them, the group (main_key, exe) and
    "default" are added as fallbacks; otherwise use_compat is switched off
    """
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = f'{main_key}+{exe}' if main_key != exe else exe
    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
4035
4036
class ISO639Utils:
    """ Conversion between two-letter and three-letter language codes """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Several two-letter codes may map to the same three-letter code;
    # the current code is then listed before the ones it replaced
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked at. Returns None for unknown codes
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns the first two-letter code in the table that maps to code, or None if there is none
        """
        matching = (
            short_name for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matching, None)
4241
4242
class ISO3166Utils:
    """Lookup of country names by their two-letter (ISO 3166-1 alpha-2) code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the corresponding full name

        The lookup is case-insensitive; None is returned for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4504
4505
class GeoUtils:
    """Helpers for picking IPv4 addresses out of per-country address blocks"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random dotted-quad IPv4 address from a country or CIDR block

        A two-character argument is treated as a country code (case-insensitive)
        and looked up in `_country_ip_map`; None is returned if it is unknown.
        Anything else is taken to be a block in `address/prefix-length` notation.
        """
        block = code_or_block
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        network, prefix_len = block.split('/')
        # Work on the address as an unsigned 32-bit integer in network byte order
        lowest, = struct.unpack('!L', socket.inet_aton(network))
        # Setting every host bit gives the highest address of the block
        highest = lowest | (0xffffffff >> int(prefix_len))
        chosen = struct.pack('!L', random.randint(lowest, highest))
        return str(socket.inet_ntoa(chosen))
4764
4765
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden per request

    A request may carry a `Ytdl-request-proxy` header naming the proxy to use
    for that request only; the header is removed before the request goes on.
    """

    def __init__(self, proxies=None):
        # Set default handlers. The base class initialiser runs afterwards and
        # may replace them for the schemes configured in `proxies`
        for scheme in ('http', 'https'):
            setattr(
                self, f'{scheme}_open',
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open: meth(r, proxy, type))
        super().__init__(proxies)

    def proxy_open(self, req, proxy, type):
        """Pick the proxy for `req`, preferring the per-request header over `proxy`

        Returns None when no proxy applies or when a SOCKS proxy was recorded in
        the `Ytdl-socks-proxy` header; otherwise defers to the base class.
        """
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = urllib.parse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers take care of wrapping the socket with socks
            return None
        return super().proxy_open(req, proxy, type)
4789
4790
4791 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4792 # released into Public Domain
4793 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4794
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    Zero and negative values yield a single zero byte. If optional blocksize
    is given and greater than zero, the front of the byte string is padded
    with binary zeros so that the length is a multiple of blocksize.
    """
    remaining = int(n)
    # Collect 32-bit words, least significant first
    words = []
    while remaining > 0:
        words.append(struct.pack('>I', remaining & 0xffffffff))
        remaining >>= 32
    words.reverse()
    # Drop the leading zeros of the top word; nothing left means n <= 0
    digits = b''.join(words).lstrip(b'\000') or b'\000'
    if blocksize > 0:
        overhang = len(digits) % blocksize
        if overhang:
            digits = b'\000' * (blocksize - overhang) + digits
    return digits
4823
4824
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Left-pad with zero bytes up to a whole number of 32-bit words
    missing = -len(s) % 4
    if missing:
        s = b'\000' * missing + s
    value = 0
    for word in struct.unpack(f'>{len(s) // 4}I', s):
        value = (value << 32) + word
    return value
4840
4841
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # The payload is the byte-reversed data read as one hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return f'{pow(payload, exponent, modulus):x}'
4857
4858
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout `00 02 <padding> 00 <data>`. The padding bytes
    are random and never zero, so the first zero byte after the `02` marker
    unambiguously separates the padding from the data.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data is longer than length - 11
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding octets must be non-zero: a zero would be taken for the separator
    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2, *padding, 0, *data]
4872
4873
def _base_n_table(n, table):
    """Return the symbol table for base `n`

    `table` defaults to digits, then lowercase, then uppercase ASCII letters
    and is cut down to its first `n` symbols.

    Raises ValueError when neither argument is given, or when `n` is given
    and the table does not provide exactly `n` symbols.
    """
    if not (table or n):
        raise ValueError('Either table or n must be specified')
    symbols = table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    symbols = symbols[:n]

    if n and len(symbols) != n:
        raise ValueError(f'base {n} exceeds table length {len(symbols)}')
    return symbols
4882
4883
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string

    @param num      non-negative integer to convert
    @param n        base; `n` and `table` are resolved by `_base_n_table`
    @param table    symbols to use as digits, lowest value first
    @returns        the digits of `num`, most significant first
    @raises ValueError  for a negative `num`, or a non-zero `num` when the
                        table has a single symbol
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]
    if num < 0:
        # Floor division never takes a negative number to zero
        # (-1 // base == -1), so the digit loop below would never terminate
        raise ValueError('Cannot encode a negative number')

    base = len(table)
    if base == 1:
        # num // 1 == num: the digit loop would never terminate either
        raise ValueError('Cannot encode a non-zero number with a single-symbol table')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4895
4896
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int

    `n` and `table` are resolved by `_base_n_table`. A character that is not
    in the table raises KeyError.
    """
    value_of = {}
    for position, symbol in enumerate(_base_n_table(n, table)):
        value_of[symbol] = position

    radix = len(value_of)
    number = 0
    for symbol in string:
        number = number * radix + value_of[symbol]
    return number
4904
4905
def decode_packed_codes(code):
    """Expand the packed code found in `code`

    PACKED_CODES_RE must capture, in this order, the obfuscated code, the
    base, the symbol count and the '|'-separated symbol list. Every word of
    the obfuscated code is then replaced by the symbol stored under its
    base-n index; an empty symbol stands for the word itself.
    NOTE(review): presumably the input is the output of a JavaScript packer - confirm
    """
    packed, radix, remaining, words = re.search(PACKED_CODES_RE, code).groups()
    radix = int(radix)
    remaining = int(remaining)
    words = words.split('|')

    replacements = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        replacements[token] = words[remaining] or token

    def substitute(match):
        return replacements[match.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, packed)
4922
4923
def caesar(s, alphabet, shift):
    """Shift every character of `s` by `shift` positions within `alphabet`

    The shift wraps around the alphabet; characters that are not part of the
    alphabet are kept as they are. A shift of 0 returns `s` unchanged.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4931
4932
def rot47(s):
    """Apply the ROT47 cipher to `s`

    The alphabet is the 94 ASCII characters from '!' to '~', so rotating by
    47 twice gives back the original text.
    """
    alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, alphabet, 47)
4935
4936
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated `KEY=value` attribute list into a dict

    Values may be enclosed in double quotes (and may then contain commas);
    the quotes are not part of the returned value.
    """
    info = {}
    for match in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        key, val = match.group('key', 'val')
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
4944
4945
def urshift(val, n):
    """Shift `val` right by `n` bits, treating a negative `val` as unsigned 32-bit"""
    if val < 0:
        # Map the negative number onto its unsigned 32-bit counterpart first
        val += 0x100000000
    return val >> n
4948
4949
def write_xattr(path, key, value):
    """Store `value` (bytes) as the extended attribute `key` of the file `path`

    Raises XAttrMetadataError when writing fails and XAttrUnavailableError
    when there is neither a usable python module nor an executable to write
    extended attributes with.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        stream_path = f'{path}:{key}'
        try:
            with open(stream_path, 'wb') as stream:
                stream.write(value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        setter = xattr.set if version_tuple(xattr.__version__) >= (0, 5, 0) else None
    elif xattr:
        setter = xattr.setxattr
    else:
        setter = None

    if setter:
        try:
            setter(path, key, value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    if check_executable('setfattr', ['--version']):
        tool = 'setfattr'
    elif check_executable('xattr', ['-h']):
        tool = 'xattr'
    else:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The executables take the value as a command line argument, i.e. as text
    value = value.decode()
    if tool == 'xattr':
        command = [tool, '-w', key, value, path]
    else:
        command = [tool, '-n', key, '-v', value, path]
    try:
        _, stderr, returncode = Popen.run(
            command, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as err:
        raise XAttrMetadataError(err.errno, err.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4999
5000
def random_birthday(year_field, month_field, day_field):
    """Return a random date between 1950-01-01 and 1995-12-31 (both inclusive)

    The result is a dict that maps the given field names to the year, month
    and day of that date as decimal strings without zero padding.
    """
    earliest = datetime.date(1950, 1, 1).toordinal()
    latest = datetime.date(1995, 12, 31).toordinal()
    birthday = datetime.date.fromordinal(random.randint(earliest, latest))
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }
5011
5012
def find_available_port(interface=''):
    """Return a port number that is currently free on `interface`

    The port is chosen by binding a temporary socket to port 0. None is
    returned if the socket cannot be created or bound.
    """
    try:
        with socket.socket() as probe:
            probe.bind((interface, 0))
            port = probe.getsockname()[1]
    except OSError:
        return None
    return port
5020
5021
# Templates for internet shortcut files, which are plain text files.
# They contain %-style named placeholders: `%(url)s` in all of them and
# additionally `%(filename)s` in the desktop entry.
# NOTE(review): presumably rendered with `template % {...}` by the caller - confirm

# `[InternetShortcut]` section with the URL as its only key
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# XML property list (Apple PLIST 1.0 DTD) with the URL as its only entry
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# `[Desktop Entry]` of type Link; the only template that also needs `filename`
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps the name of a link type to the template for it
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5053
5054
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    An explicit port is kept, except for port 80 of `http` IRIs. IRIs without a host (e.g. relative references) are converted with an empty host.

    @param iri          IRI to convert
    @returns            the corresponding URI
    @raises ValueError  if the network location contains '[' (IPv6 addresses are not supported)
    """

    iri_parts = urllib.parse.urlparse(iri)

    # Parsing an IRI with only one bracket in its network location raises a ValueError already.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no host
    hostname = iri_parts.hostname
    if hostname:
        net_location += hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
        # The 'idna' encoding produces ASCII text.

    # Port 80 is implied by the `http` scheme only. For any other scheme
    # (e.g. `https`), dropping it would make the URI point to another port.
    port = iri_parts.port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    # NOTE(review): `quote_plus` turns spaces into '+' in every component, including the path - confirm that this is intended
    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5097
5098
def to_high_limit_path(path):
    """Return `path` in a form that works around the MAX_PATH limitation on Windows

    On win32/cygwin the absolute path is returned with the extended-length prefix;
    on every other platform `path` is returned unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path

    # The maximum allowed length for the individual path segments may still be quite limited.
    absolute = os.path.abspath(path)
    return '\\\\?\\' + absolute
5105
5106
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Look up `field` in `obj` and format the result with `template`

    @param field     Path (or paths) handed to traverse_obj
    @param template  %-style template applied to `func(value)`
    @param ignore    Value(s) for which `default` is returned instead;
                     when not given, any falsy value is ignored
    @param default   Returned when the value is ignored
    @param func      Applied to the value before formatting
    """
    value = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skipped = not value
    else:
        skipped = value in variadic(ignore)
    if skipped:
        return default
    return template % func(value)
5112
5113
def clean_podcast_url(url):
    """Remove known podcast tracking/redirect prefixes from `url`, leaving the underlying media URL"""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    # Every occurrence is dropped, so chained trackers are all removed
    return re.sub(tracking_prefix_re, '', url)
5129
5130
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version 4 UUID as a lowercase hyphenated string

    `x` positions of the template are filled with arbitrary hex digits.
    The `y` position carries the variant: RFC 4122 requires its two most
    significant bits to be `10`, so only 8, 9, a or b are valid there.
    """
    def _random_digit(mobj):
        return random.choice(_HEX_TABLE if mobj.group() == 'x' else '89ab')

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5136
5137
def make_dir(path, to_screen=None):
    """Make sure the directory that should contain `path` exists

    @param path       File path; only its directory component is created
    @param to_screen  Optional callable that receives a message if creation fails
    @returns          True if the directory exists afterwards (or `path` has no
                      directory component), False if it could not be created
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Only report when a reporter was actually supplied
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
5148
5149
def get_executable_path():
    """Return the absolute directory of the executable path reported by the update module"""
    # Imported here rather than at module level
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    absolute = os.path.abspath(executable)
    return os.path.dirname(absolute)
5154
5155
def get_user_config_dirs(package_name):
    """Yield the candidate per-user configuration directories for `package_name`"""
    # .config (e.g. ~/.config/package_name); $XDG_CONFIG_HOME wins when it is set and non-empty
    config_home = os.getenv('XDG_CONFIG_HOME')
    if not config_home:
        config_home = compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name), only when the variable is set
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # home (~/.package_name)
    home = compat_expanduser('~')
    yield os.path.join(home, f'.{package_name}')
5168
5169
def get_system_config_dirs(package_name):
    """Yield the system-wide configuration directories for `package_name` (only /etc/package_name)"""
    etc_dir = os.path.join('/etc', package_name)
    yield etc_dir
5173
5174
def time_seconds(**kwargs):
    """
    Returns the time in seconds since the epoch (1970-01-01T00:00:00Z), shifted by an offset.
    The keyword arguments are passed to datetime.timedelta to build that offset
    """
    now = time.time()
    offset = datetime.timedelta(**kwargs)
    return now + offset.total_seconds()
5180
5181
# Create a JSON Web Signature (JWS) using the HS256 algorithm.
# The result is laid out like the JWS Compact Serialization (header.payload.signature)
# JWT: https://www.rfc-editor.org/rfc/rfc7519.html
# JWS: https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Return `payload_data` signed with `key` (HMAC-SHA256) as a bytes token

    @param payload_data  JSON-serializable payload
    @param key           Secret, as str
    @param headers       Extra header fields; may override 'alg' and 'typ'

    NOTE: the segments are encoded with standard (padded) Base64
    """
    def _encode_segment(data):
        return base64.b64encode(json.dumps(data).encode())

    header_data = {'alg': 'HS256', 'typ': 'JWT'}
    if headers:
        header_data.update(headers)

    signing_input = _encode_segment(header_data) + b'.' + _encode_segment(payload_data)
    digest = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(digest)
5199
5200
# Can be extended in future to verify the signature, parse the header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the JSON-decoded payload of a `header.payload.signature` token; the signature is not checked"""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # Re-add trailing ='s that may have been stripped; superfluous ='s are ignored
    padded_payload = f'{payload_b64}==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
5207
5208
# False on Windows until windows_enable_vt_mode() succeeds; None on other systems
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Whether terminal escape sequences may be written to `stream` (the result is cached per stream)"""
    if compat_os_name == 'nt':
        capable = WINDOWS_VT_MODE
    else:
        capable = os.getenv('TERM')
    if not capable:
        return False

    try:
        is_tty = stream.isatty()
    except BaseException:
        is_tty = False
    return is_tty
5223
5224
def windows_enable_vt_mode():
    """Enable virtual terminal sequence processing on the Windows console

    Does nothing when the Windows version is older than 10.0.10586.
    On success, sets WINDOWS_VT_MODE to True and clears the cache of
    supports_terminal_sequences.
    Raises Exception if GetConsoleMode or SetConsoleMode reports failure.

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported here, after the version check, rather than at module level
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        # Convert the file descriptor into the OS handle expected by the console API
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Keep the current mode bits and add the VT processing flag
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        # The descriptor is closed whether or not the mode change succeeded
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Results cached while VT mode was off are no longer valid
    supports_terminal_sequences.cache_clear()
5255
5256
# ESC, '[', one or more characters other than 'm', then 'm'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with every sequence matched by _terminal_sequences_re removed"""
    return re.sub(_terminal_sequences_re, '', string)
5262
5263
def number_of_digits(number):
    """Return the length of `number` formatted with `%d` (a minus sign is counted too)"""
    formatted = '%d' % number
    return len(formatted)
5266
5267
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy `values` with `delim`

    When `from_dict` is given, each value is instead treated as a path
    that is looked up in `from_dict` using traverse_obj.
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(key)) for key in values)
    return delim.join(str(value) for value in values if value)
5272
5273
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Pick the largest (width, height) pair among `formats` and, for each thumbnail:
    * Rewrite the URL: whatever `url_width_re` matches is replaced with that width
    * Set the dimensions to that pair

    The thumbnails are returned unchanged when no format has a width.
    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    # Pairs compare by width first, then height; missing values count as 0
    largest = max(
        ((fmt.get('width') or 0, fmt.get('height') or 0) for fmt in formats),
        default=(0, 0))
    max_width = largest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumb in thumbnails:
        scaled.append(merge_dicts(
            {'url': re.sub(url_width_re, str(max_width), thumb['url'])},
            dict(zip(dimension_keys, largest)), thumb))
    return scaled
5294
5295
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into a (start, end, total) tuple.

    All three elements are None when the value is empty or cannot be parsed. """
    unparsed = (None, None, None)
    if not range:
        return unparsed
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if mobj is None:
        return unparsed
    start, end, total = mobj.groups()
    return int(start), int_or_none(end), int_or_none(total)
5304
5305
def read_stdin(what):
    """Announce that `what` is being read from STDIN and return the sys.stdin stream"""
    eof_key = 'Ctrl+D'
    if compat_os_name == 'nt':
        eof_key = 'Ctrl+Z'
    write_string(f'Reading {what} from STDIN - EOF ({eof_key}) to end:\n')
    return sys.stdin
5310
5311
def determine_file_encoding(data):
    """
    Detect the text encoding used by `data` (bytes)
    @returns (encoding, bytes to skip); the encoding is None when nothing was detected
    """
    # BOM marks are given priority over declarations
    bom_match = next(((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_match is not None:
        return bom_match

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    without_nulls = data.replace(b'\0', b'')
    # re.match: the "# coding: ..." declaration must be at the very start of the data
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', without_nulls)
    if declaration is None:
        return None, 0
    return declaration.group(1).decode(), 0
5328
5329
class Config:
    """A set of command-line style arguments plus the configs they pull in

    Every instance holds its own arguments (`own_args`) and a list of nested
    Config objects (`configs`) created for each entry of the parsed
    `config_locations` option.
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser = parser
        self.label = label
        # Shared with nested configs (see append_config) so nothing is loaded twice
        self._loaded_paths = set()
        self.configs = []

    def init(self, args=None, filename=None):
        """Store the arguments/filename and load the nested configs; must only be called once"""
        assert not self.__initialized
        self.own_args = args
        self.filename = filename
        return self.load_configs()

    def load_configs(self):
        """Parse the own arguments and load every config location they name

        @returns  False if `filename` had already been loaded, True otherwise
        """
        base_dir = ''
        if self.filename:
            real_path = os.path.realpath(self.filename)
            base_dir = os.path.dirname(real_path)
            if real_path in self._loaded_paths:
                return False
            self._loaded_paths.add(real_path)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # STDIN is read at most once
                if location not in self._loaded_paths:
                    self._loaded_paths.add(location)
                    self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against the directory of this config's file
            location = os.path.join(base_dir, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        own_line = (
            self.own_args is not None
            and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}')
        # Nested configs are shown with every line prefixed by "| "
        nested_lines = (f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs)
        return join_nonempty(own_line, *nested_lines, delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read `filename` and split its contents into a list of arguments

        @returns  `default` if the file cannot be opened
        Raises ValueError if the contents cannot be decoded or split
        """
        try:
            stream = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            encoding, bom_length = determine_file_encoding(stream.read(512))
            stream.seek(bom_length, io.SEEK_SET)
        except OSError:
            encoding = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            text = stream.read().decode(encoding or preferredencoding())
            return shlex.split(text, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            stream.close()

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` where the values of login-related options are replaced with PRIVATE"""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        # "--option=value" form
        scrubbed = []
        for opt in opts:
            mobj = eqre.match(opt)
            scrubbed.append(mobj.group('key') + '=PRIVATE' if mobj else opt)

        # "--option value" form: the argument after the option holds the secret
        for idx, opt in enumerate(scrubbed):
            if opt in PRIVATE_OPTS and idx + 1 < len(scrubbed):
                scrubbed[idx + 1] = 'PRIVATE'
        return scrubbed

    def append_config(self, *args, label=None):
        """Create a nested config from `args` (as accepted by init) and keep it if it was loaded"""
        nested = type(self)(self.parser, label)
        nested._loaded_paths = self._loaded_paths
        if nested.init(*args):
            self.configs.append(nested)

    @property
    def all_args(self):
        """Arguments of the nested configs (last appended first), followed by the own parsed arguments"""
        for nested in reversed(self.configs):
            yield from nested.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
5437
5438
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes

    The connection runs on a private event loop. It can be used as a context
    manager, and `__exit__` is also registered with atexit on construction.
    """
    pool = None

    def __init__(self, url, headers=None, connect=True):
        """
        @param url      URL passed to websockets.connect
        @param headers  Passed to websockets.connect as `extra_headers`
        @param connect  Whether to open the connection immediately
        """
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        # Already shut down, e.g. the atexit hook running after an explicit exit
        if self.loop.is_closed():
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Leftover tasks must be cancelled while the loop can still run;
            # only then can the loop be closed
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` on `loop` until it completes and return its result

        Raises ValueError if `main` is not a coroutine.
        """
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel every unfinished task of `loop` and report the exceptions they raised"""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # gather() no longer accepts "loop" in python 3.10+; the tasks already belong to `loop`
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5508
5509
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones

    The keys of the result are title-cased."""
    merged = {}
    for headers in dicts:
        # dict.items is called unbound, so the arguments must be dict instances
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
5513
5514
def cached_method(f):
    """Cache the results of a method on the instance, keyed by its arguments

    The results are stored in the instance's `_cached_method__cache` dict
    under the method's name, so the arguments must be hashable.
    """
    method_signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Normalize positional/keyword/default arguments so equivalent calls share a key
        bound = method_signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        _, *call_values = bound.arguments.values()
        key = tuple(call_values)

        per_instance = vars(self).setdefault('_cached_method__cache', {})
        results = per_instance.setdefault(f.__name__, {})
        if key in results:
            return results[key]
        results[key] = f(self, *args, **kwargs)
        return results[key]
    return wrapper
5530
5531
class classproperty:
    """property access for class methods with optional caching

    Usable as `@classproperty` or as `@classproperty(cache=True)`;
    with caching, the function is evaluated once per class.
    """
    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Called with options only: wait for the function to decorate
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cached = self._cache
        if cached is None:
            return self.func(cls)
        if cls not in cached:
            cached[cls] = self.func(cls)
        return cached[cls]
5550
5551
class function_with_repr:
    """Callable wrapper around `func` with a custom repr

    The repr is `repr_` when given, otherwise the function's `module.qualname`.
    """

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5564
5565
class Namespace(types.SimpleNamespace):
    """Simple namespace that can be iterated over

    Iterating yields the attribute values; `items_` gives the (name, value) pairs.
    Nothing in this class prevents attributes from being reassigned.
    """

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        return vars(self).items()
5575
5576
# File extensions grouped by kind of media.
# `common_video`/`common_audio` are kept as separate groups and are also
# folded into `video`/`audio` right after construction.
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# After these two lines `video` and `audio` list the less common extensions first, then the common ones
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions (thumbnails, storyboards and subtitles are not included)
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5591
5592
5593 class RetryManager:
5594     """Usage:
5595         for retry in RetryManager(...):
5596             try:
5597                 ...
5598             except SomeException as err:
5599                 retry.error = err
5600                 continue
5601     """
5602     attempt, _error = 0, None
5603
5604     def __init__(self, _retries, _error_callback, **kwargs):
5605         self.retries = _retries or 0
5606         self.error_callback = functools.partial(_error_callback, **kwargs)
5607
5608     def _should_retry(self):
5609         return self._error is not NO_DEFAULT and self.attempt <= self.retries
5610
5611     @property
5612     def error(self):
5613         if self._error is NO_DEFAULT:
5614             return None
5615         return self._error
5616
5617     @error.setter
5618     def error(self, value):
5619         self._error = value
5620
5621     def __iter__(self):
5622         while self._should_retry():
5623             self.error = NO_DEFAULT
5624             self.attempt += 1
5625             yield self
5626             if self.error:
5627                 self.error_callback(self.error, self.attempt, self.retries)
5628
5629     @staticmethod
5630     def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5631         """Utility function for reporting retries"""
5632         if count > retries:
5633             if error:
5634                 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5635             raise e
5636
5637         if not count:
5638             return warn(e)
5639         elif isinstance(e, ExtractorError):
5640             e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5641         warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5642
5643         delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5644         if delay:
5645             info(f'Sleeping {delay:.2f} seconds ...')
5646             time.sleep(delay)
5647
5648
def make_archive_id(ie, video_id):
    """Build the archive id `<lowercased ie key> <video_id>`

    `ie` is either the key itself (str) or an object providing `ie_key()`.
    """
    extractor_key = ie
    if not isinstance(extractor_key, str):
        extractor_key = ie.ie_key()
    return f'{extractor_key.lower()} {video_id}'
5652
5653
def truncate_string(s, left, right=0):
    """Shorten `s` to `left + right` characters, marking the cut with "..."

    The first `left - 3` and the last `right` characters are kept.
    `s` is returned unchanged if it is None or already short enough.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    # s[-0:] would be the whole string, hence the explicit check
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5659
5660
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Build the ordered set of items selected by `options`

    @param options     Item names or aliases; a leading "-" removes instead of adds
    @param alias_dict  Maps each alias to a list of options; must contain "all",
                       which lists every valid item
    @param use_regex   Treat non-alias options as case-insensitive regexes
                       that must fully match an item of "all"
    @param start       Items selected initially
    Raises ValueError (with the offending name) for an unknown option.
    """
    assert 'all' in alias_dict, '"all" alias is required'
    selected = list(start or [])
    for option in options:
        negated = option.startswith('-')
        name = option[1:] if negated else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if negated:
                # Removing an alias flips the sign of every option it stands for
                expansion = [
                    entry[1:] if entry.startswith('-') else f'-{entry}' for entry in expansion]
            # NB: Do not allow regex in aliases for performance
            selected = orderedSet_from_options(expansion, alias_dict, start=selected)
            continue

        if use_regex:
            matches = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            matches = [name]
        else:
            raise ValueError(name)

        if not negated:
            selected.extend(matches)
            continue
        for item in matches:
            while item in selected:
                selected.remove(item)

    return orderedSet(selected)
5689
5690
5691 # TODO: Rewrite
5692 class FormatSorter:
5693     regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5694
5695     default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5696                'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5697                'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
5698     ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5699                     'height', 'width', 'proto', 'vext', 'abr', 'aext',
5700                     'fps', 'fs_approx', 'source', 'id')
5701
5702     settings = {
5703         'vcodec': {'type': 'ordered', 'regex': True,
5704                    'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5705         'acodec': {'type': 'ordered', 'regex': True,
5706                    'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
5707         'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5708                 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5709         'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5710                   'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5711         'vext': {'type': 'ordered', 'field': 'video_ext',
5712                  'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5713                  'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
5714         'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5715                  'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5716                  'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
5717         'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5718         'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5719                        'field': ('vcodec', 'acodec'),
5720                        'function': lambda it: int(any(v != 'none' for v in it))},
5721         'ie_pref': {'priority': True, 'type': 'extractor'},
5722         'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5723         'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5724         'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5725         'quality': {'convert': 'float', 'default': -1},
5726         'filesize': {'convert': 'bytes'},
5727         'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5728         'id': {'convert': 'string', 'field': 'format_id'},
5729         'height': {'convert': 'float_none'},
5730         'width': {'convert': 'float_none'},
5731         'fps': {'convert': 'float_none'},
5732         'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5733         'tbr': {'convert': 'float_none'},
5734         'vbr': {'convert': 'float_none'},
5735         'abr': {'convert': 'float_none'},
5736         'asr': {'convert': 'float_none'},
5737         'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5738
5739         'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5740         'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'),
5741                'function': lambda it: next(filter(None, it), None)},
5742         'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'),
5743                  'function': lambda it: next(filter(None, it), None)},
5744         'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5745         'res': {'type': 'multiple', 'field': ('height', 'width'),
5746                 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5747
5748         # Actual field names
5749         'format_id': {'type': 'alias', 'field': 'id'},
5750         'preference': {'type': 'alias', 'field': 'ie_pref'},
5751         'language_preference': {'type': 'alias', 'field': 'lang'},
5752         'source_preference': {'type': 'alias', 'field': 'source'},
5753         'protocol': {'type': 'alias', 'field': 'proto'},
5754         'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5755         'audio_channels': {'type': 'alias', 'field': 'channels'},
5756
5757         # Deprecated
5758         'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5759         'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5760         'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5761         'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5762         'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5763         'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5764         'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5765         'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5766         'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5767         'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5768         'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5769         'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5770         'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5771         'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5772         'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5773         'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5774         'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5775         'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5776         'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5777         'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5778     }
5779
5780     def __init__(self, ydl, field_preference):
5781         self.ydl = ydl
5782         self._order = []
5783         self.evaluate_params(self.ydl.params, field_preference)
5784         if ydl.params.get('verbose'):
5785             self.print_verbose_info(self.ydl.write_debug)
5786
5787     def _get_field_setting(self, field, key):
5788         if field not in self.settings:
5789             if key in ('forced', 'priority'):
5790                 return False
5791             self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5792                                         'deprecated and may be removed in a future version')
5793             self.settings[field] = {}
5794         propObj = self.settings[field]
5795         if key not in propObj:
5796             type = propObj.get('type')
5797             if key == 'field':
5798                 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5799             elif key == 'convert':
5800                 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5801             else:
5802                 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5803             propObj[key] = default
5804         return propObj[key]
5805
5806     def _resolve_field_value(self, field, value, convertNone=False):
5807         if value is None:
5808             if not convertNone:
5809                 return None
5810         else:
5811             value = value.lower()
5812         conversion = self._get_field_setting(field, 'convert')
5813         if conversion == 'ignore':
5814             return None
5815         if conversion == 'string':
5816             return value
5817         elif conversion == 'float_none':
5818             return float_or_none(value)
5819         elif conversion == 'bytes':
5820             return parse_bytes(value)
5821         elif conversion == 'order':
5822             order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5823             use_regex = self._get_field_setting(field, 'regex')
5824             list_length = len(order_list)
5825             empty_pos = order_list.index('') if '' in order_list else list_length + 1
5826             if use_regex and value is not None:
5827                 for i, regex in enumerate(order_list):
5828                     if regex and re.match(regex, value):
5829                         return list_length - i
5830                 return list_length - empty_pos  # not in list
5831             else:  # not regex or  value = None
5832                 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5833         else:
5834             if value.isnumeric():
5835                 return float(value)
5836             else:
5837                 self.settings[field]['convert'] = 'string'
5838                 return value
5839
5840     def evaluate_params(self, params, sort_extractor):
5841         self._use_free_order = params.get('prefer_free_formats', False)
5842         self._sort_user = params.get('format_sort', [])
5843         self._sort_extractor = sort_extractor
5844
5845         def add_item(field, reverse, closest, limit_text):
5846             field = field.lower()
5847             if field in self._order:
5848                 return
5849             self._order.append(field)
5850             limit = self._resolve_field_value(field, limit_text)
5851             data = {
5852                 'reverse': reverse,
5853                 'closest': False if limit is None else closest,
5854                 'limit_text': limit_text,
5855                 'limit': limit}
5856             if field in self.settings:
5857                 self.settings[field].update(data)
5858             else:
5859                 self.settings[field] = data
5860
5861         sort_list = (
5862             tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5863             + (tuple() if params.get('format_sort_force', False)
5864                 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5865             + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5866
5867         for item in sort_list:
5868             match = re.match(self.regex, item)
5869             if match is None:
5870                 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5871             field = match.group('field')
5872             if field is None:
5873                 continue
5874             if self._get_field_setting(field, 'type') == 'alias':
5875                 alias, field = field, self._get_field_setting(field, 'field')
5876                 if self._get_field_setting(alias, 'deprecated'):
5877                     self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5878                                                 f'be removed in a future version. Please use {field} instead')
5879             reverse = match.group('reverse') is not None
5880             closest = match.group('separator') == '~'
5881             limit_text = match.group('limit')
5882
5883             has_limit = limit_text is not None
5884             has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5885             has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5886
5887             fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5888             limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5889             limit_count = len(limits)
5890             for (i, f) in enumerate(fields):
5891                 add_item(f, reverse, closest,
5892                          limits[i] if i < limit_count
5893                          else limits[0] if has_limit and not has_multiple_limits
5894                          else None)
5895
5896     def print_verbose_info(self, write_debug):
5897         if self._sort_user:
5898             write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5899         if self._sort_extractor:
5900             write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5901         write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5902             '+' if self._get_field_setting(field, 'reverse') else '', field,
5903             '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5904                           self._get_field_setting(field, 'limit_text'),
5905                           self._get_field_setting(field, 'limit'))
5906             if self._get_field_setting(field, 'limit_text') is not None else '')
5907             for field in self._order if self._get_field_setting(field, 'visible')]))
5908
5909     def _calculate_field_preference_from_value(self, format, field, type, value):
5910         reverse = self._get_field_setting(field, 'reverse')
5911         closest = self._get_field_setting(field, 'closest')
5912         limit = self._get_field_setting(field, 'limit')
5913
5914         if type == 'extractor':
5915             maximum = self._get_field_setting(field, 'max')
5916             if value is None or (maximum is not None and value >= maximum):
5917                 value = -1
5918         elif type == 'boolean':
5919             in_list = self._get_field_setting(field, 'in_list')
5920             not_in_list = self._get_field_setting(field, 'not_in_list')
5921             value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5922         elif type == 'ordered':
5923             value = self._resolve_field_value(field, value, True)
5924
5925         # try to convert to number
5926         val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5927         is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5928         if is_num:
5929             value = val_num
5930
5931         return ((-10, 0) if value is None
5932                 else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
5933                 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5934                 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5935                 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5936                 else (-1, value, 0))
5937
5938     def _calculate_field_preference(self, format, field):
5939         type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
5940         get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5941         if type == 'multiple':
5942             type = 'field'  # Only 'field' is allowed in multiple for now
5943             actual_fields = self._get_field_setting(field, 'field')
5944
5945             value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5946         else:
5947             value = get_value(field)
5948         return self._calculate_field_preference_from_value(format, field, type, value)
5949
5950     def calculate_preference(self, format):
5951         # Determine missing protocol
5952         if not format.get('protocol'):
5953             format['protocol'] = determine_protocol(format)
5954
5955         # Determine missing ext
5956         if not format.get('ext') and 'url' in format:
5957             format['ext'] = determine_ext(format['url'])
5958         if format.get('vcodec') == 'none':
5959             format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5960             format['video_ext'] = 'none'
5961         else:
5962             format['video_ext'] = format['ext']
5963             format['audio_ext'] = 'none'
5964         # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
5965         #    format['preference'] = -1000
5966
5967         if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5968             # HEVC-over-FLV is out-of-spec by FLV's original spec
5969             # ref. https://trac.ffmpeg.org/ticket/6389
5970             # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5971             format['preference'] = -100
5972
5973         # Determine missing bitrates
5974         if format.get('vcodec') == 'none':
5975             format['vbr'] = 0
5976         if format.get('acodec') == 'none':
5977             format['abr'] = 0
5978         if not format.get('vbr') and format.get('vcodec') != 'none':
5979             format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
5980         if not format.get('abr') and format.get('acodec') != 'none':
5981             format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
5982         if not format.get('tbr'):
5983             format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
5984
5985         return tuple(self._calculate_field_preference(format, field) for field in self._order)