yt_dlp/utils/_utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import collections.abc
   9 import contextlib
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import gzip
  15 import hashlib
  16 import hmac
  17 import html.entities
  18 import html.parser
  19 import http.client
  20 import http.cookiejar
  21 import inspect
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import mimetypes
  28 import operator
  29 import os
  30 import platform
  31 import random
  32 import re
  33 import shlex
  34 import socket
  35 import ssl
  36 import struct
  37 import subprocess
  38 import sys
  39 import tempfile
  40 import time
  41 import traceback
  42 import types
  43 import unicodedata
  44 import urllib.error
  45 import urllib.parse
  46 import urllib.request
  47 import xml.etree.ElementTree
  48 import zlib
  49
  50 from . import traversal
  51
  52 from ..compat import functools  # isort: split
  53 from ..compat import (
  54     compat_etree_fromstring,
  55     compat_expanduser,
  56     compat_HTMLParseError,
  57     compat_os_name,
  58     compat_shlex_quote,
  59 )
  60 from ..dependencies import brotli, certifi, websockets, xattr
  61 from ..socks import ProxyType, sockssocket
  62
# Drop the last dotted component from this module's name so that the module
# reports the name of its parent package.
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# The concrete type of a compiled regular expression, taken from an actual
# compiled pattern.
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
  67
  68
  69 def random_user_agent():
  70     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  71     _CHROME_VERSIONS = (
  72         '90.0.4430.212',
  73         '90.0.4430.24',
  74         '90.0.4430.70',
  75         '90.0.4430.72',
  76         '90.0.4430.85',
  77         '90.0.4430.93',
  78         '91.0.4472.101',
  79         '91.0.4472.106',
  80         '91.0.4472.114',
  81         '91.0.4472.124',
  82         '91.0.4472.164',
  83         '91.0.4472.19',
  84         '91.0.4472.77',
  85         '92.0.4515.107',
  86         '92.0.4515.115',
  87         '92.0.4515.131',
  88         '92.0.4515.159',
  89         '92.0.4515.43',
  90         '93.0.4556.0',
  91         '93.0.4577.15',
  92         '93.0.4577.63',
  93         '93.0.4577.82',
  94         '94.0.4606.41',
  95         '94.0.4606.54',
  96         '94.0.4606.61',
  97         '94.0.4606.71',
  98         '94.0.4606.81',
  99         '94.0.4606.85',
 100         '95.0.4638.17',
 101         '95.0.4638.50',
 102         '95.0.4638.54',
 103         '95.0.4638.69',
 104         '95.0.4638.74',
 105         '96.0.4664.18',
 106         '96.0.4664.45',
 107         '96.0.4664.55',
 108         '96.0.4664.93',
 109         '97.0.4692.20',
 110     )
 111     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 112
 113
# Content encodings that are always listed; 'br' is appended only when the
# optional `brotli` dependency is truthy (i.e. was imported successfully).
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default HTTP headers. The User-Agent is generated once, when this module is
# imported, so it stays the same for the lifetime of the process.
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


# Fixed User-Agent strings keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
 131
 132
class NO_DEFAULT:
    """Sentinel meaning "no default value was supplied".

    The class object itself is used as the marker and compared by identity
    (``default is not NO_DEFAULT``), so that ``None`` remains usable as a
    real default value.
    """
    pass
 135
 136
def IDENTITY(x):
    """Return the argument unchanged."""
    return x
 139
 140
# Full English month names, in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names in calendar order, keyed by two-letter language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# Timezone abbreviation -> integer offset
# NOTE(review): the values look like whole-hour offsets from UTC - confirm against the users of this table
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# Maps each accented character of the first string to its ASCII replacement.
# The replacements are produced by chaining single-character strings with
# lists holding the multi-character replacements ('AE', 'OE', 'TH', 'ss', ...),
# so the order of both sequences must stay in step.
# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 171
# Date/time layouts written with %-directives (%Y, %m, %d, %H, ...).
# None of these is ambiguous about day-vs-month order; the ambiguous numeric
# layouts live in the two extended lists below.
# NOTE(review): presumably consumed by a strptime-based parser elsewhere - confirm
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# The common formats plus numeric layouts read as day-before-month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# The common formats plus numeric layouts read as month-before-day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Captures four groups from text of the form }('<str>',<int>,<int>,'<str>'.split('|')
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element and captures its
# object/array payload in the named group `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# An unsigned decimal number with an optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
 242
 243
 244 @functools.cache
 245 def preferredencoding():
 246     """Get preferred encoding.
 247
 248     Returns the best encoding scheme for the system, based on
 249     locale.getpreferredencoding() and some further tweaks.
 250     """
 251     try:
 252         pref = locale.getpreferredencoding()
 253         'TEST'.encode(pref)
 254     except Exception:
 255         pref = 'UTF-8'
 256
 257     return pref
 258
 259
 260 def write_json_file(obj, fn):
 261     """ Encode obj as JSON and write it to fn, atomically if possible """
 262
 263     tf = tempfile.NamedTemporaryFile(
 264         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 265         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 266
 267     try:
 268         with tf:
 269             json.dump(obj, tf, ensure_ascii=False)
 270         if sys.platform == 'win32':
 271             # Need to remove existing file on Windows, else os.rename raises
 272             # WindowsError or FileExistsError.
 273             with contextlib.suppress(OSError):
 274                 os.unlink(fn)
 275         with contextlib.suppress(OSError):
 276             mask = os.umask(0)
 277             os.umask(mask)
 278             os.chmod(tf.name, 0o666 & ~mask)
 279         os.rename(tf.name, fn)
 280     except Exception:
 281         with contextlib.suppress(OSError):
 282             os.remove(tf.name)
 283         raise
 284
 285
 286 def find_xpath_attr(node, xpath, key, val=None):
 287     """ Find the xpath xpath[@key=val] """
 288     assert re.match(r'^[a-zA-Z_-]+$', key)
 289     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 290     return node.find(expr)
 291
# On Python 2.6, the xml.etree.ElementTree.Element methods do not support
# the namespace parameter, so namespace prefixes are expanded by hand below
 294
 295
 296 def xpath_with_ns(path, ns_map):
 297     components = [c.split(':') for c in path.split('/')]
 298     replaced = []
 299     for c in components:
 300         if len(c) == 1:
 301             replaced.append(c[0])
 302         else:
 303             ns, tag = c
 304             replaced.append('{%s}%s' % (ns_map[ns], tag))
 305     return '/'.join(replaced)
 306
 307
 308 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 309     def _find_xpath(xpath):
 310         return node.find(xpath)
 311
 312     if isinstance(xpath, str):
 313         n = _find_xpath(xpath)
 314     else:
 315         for xp in xpath:
 316             n = _find_xpath(xp)
 317             if n is not None:
 318                 break
 319
 320     if n is None:
 321         if default is not NO_DEFAULT:
 322             return default
 323         elif fatal:
 324             name = xpath if name is None else name
 325             raise ExtractorError('Could not find XML element %s' % name)
 326         else:
 327             return None
 328     return n
 329
 330
 331 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 332     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 333     if n is None or n == default:
 334         return n
 335     if n.text is None:
 336         if default is not NO_DEFAULT:
 337             return default
 338         elif fatal:
 339             name = xpath if name is None else name
 340             raise ExtractorError('Could not find XML element\'s text %s' % name)
 341         else:
 342             return None
 343     return n.text
 344
 345
 346 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 347     n = find_xpath_attr(node, xpath, key)
 348     if n is None:
 349         if default is not NO_DEFAULT:
 350             return default
 351         elif fatal:
 352             name = f'{xpath}[@{key}]' if name is None else name
 353             raise ExtractorError('Could not find XML attribute %s' % name)
 354         else:
 355             return None
 356     return n.attrib[key]
 357
 358
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document

    Shorthand for get_element_by_attribute('id', ...); extra keyword
    arguments are forwarded to it. Returns None when nothing matches.
    """
    return get_element_by_attribute('id', id, html, **kwargs)
 362
 363
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document

    Shorthand for get_element_html_by_attribute('id', ...); extra keyword
    arguments are forwarded to it. Returns None when nothing matches.
    """
    return get_element_html_by_attribute('id', id, html, **kwargs)
 367
 368
 369 def get_element_by_class(class_name, html):
 370     """Return the content of the first tag with the specified class in the passed HTML document"""
 371     retval = get_elements_by_class(class_name, html)
 372     return retval[0] if retval else None
 373
 374
 375 def get_element_html_by_class(class_name, html):
 376     """Return the html of the first tag with the specified class in the passed HTML document"""
 377     retval = get_elements_html_by_class(class_name, html)
 378     return retval[0] if retval else None
 379
 380
 381 def get_element_by_attribute(attribute, value, html, **kwargs):
 382     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 383     return retval[0] if retval else None
 384
 385
 386 def get_element_html_by_attribute(attribute, value, html, **kargs):
 387     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 388     return retval[0] if retval else None
 389
 390
 391 def get_elements_by_class(class_name, html, **kargs):
 392     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 393     return get_elements_by_attribute(
 394         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 395         html, escape_value=False)
 396
 397
 398 def get_elements_html_by_class(class_name, html):
 399     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 400     return get_elements_html_by_attribute(
 401         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 402         html, escape_value=False)
 403
 404
 405 def get_elements_by_attribute(*args, **kwargs):
 406     """Return the content of the tag with the specified attribute in the passed HTML document"""
 407     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 408
 409
 410 def get_elements_html_by_attribute(*args, **kwargs):
 411     """Return the html of the tag with the specified attribute in the passed HTML document"""
 412     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 413
 414
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator yielding one (content, whole) tuple per matching
    element; nothing is yielded when value is empty.

    @param attribute     Attribute name (matched literally)
    @param value         Attribute value; treated as a regular expression
                         when escape_value is False
    @param html          The HTML document to search
    @param tag           Regular expression for the tag name
    @param escape_value  Whether to match value literally
    """
    if not value:
        return

    # Quotes around the attribute value are mandatory when the value starts
    # with one of these characters, and optional otherwise
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches only the opening tag up to and including the wanted attribute
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Locate the complete element, starting at the matched opening tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Drop one pair of matching quotes wrapping the whole content, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 440
 441
 442 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 443     """
 444     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 445     closing tag for the first opening tag it has encountered, and can be used
 446     as a context manager
 447     """
 448
 449     class HTMLBreakOnClosingTagException(Exception):
 450         pass
 451
 452     def __init__(self):
 453         self.tagstack = collections.deque()
 454         html.parser.HTMLParser.__init__(self)
 455
 456     def __enter__(self):
 457         return self
 458
 459     def __exit__(self, *_):
 460         self.close()
 461
 462     def close(self):
 463         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 464         # so data remains buffered; we no longer have any interest in it, thus
 465         # override this method to discard it
 466         pass
 467
 468     def handle_starttag(self, tag, _):
 469         self.tagstack.append(tag)
 470
 471     def handle_endtag(self, tag):
 472         if not self.tagstack:
 473             raise compat_HTMLParseError('no tags in the stack')
 474         while self.tagstack:
 475             inner_tag = self.tagstack.pop()
 476             if inner_tag == tag:
 477                 break
 478         else:
 479             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 480         if not self.tagstack:
 481             raise self.HTMLBreakOnClosingTagException()
 482
 483
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)

    @returns  A (content, whole) tuple of substrings of html
    @raises   compat_HTMLParseError if the opening or the matching closing
              tag cannot be found
    """
    def find_or_raise(haystack, needle, exc):
        # Like str.index, but raises the given exception when needle is missing
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Convert to an absolute index just past the '>' of the opening tag
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed just the opening tag so that it becomes the bottom of the tag stack
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document one closing-tag candidate at a time; the parser
        # raises once the candidate closes the outermost element
        while offset < len(html):
            # Both positions are relative to offset
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                # This closing tag belonged to a nested element; keep going
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
 518
 519
 520 class HTMLAttributeParser(html.parser.HTMLParser):
 521     """Trivial HTML parser to gather the attributes for a single element"""
 522
 523     def __init__(self):
 524         self.attrs = {}
 525         html.parser.HTMLParser.__init__(self)
 526
 527     def handle_starttag(self, tag, attrs):
 528         self.attrs = dict(attrs)
 529         raise compat_HTMLParseError('done')
 530
 531
 532 class HTMLListAttrsParser(html.parser.HTMLParser):
 533     """HTML parser to gather the attributes for the elements of a list"""
 534
 535     def __init__(self):
 536         html.parser.HTMLParser.__init__(self)
 537         self.items = []
 538         self._level = 0
 539
 540     def handle_starttag(self, tag, attrs):
 541         if tag == 'li' and self._level == 0:
 542             self.items.append(dict(attrs))
 543         self._level += 1
 544
 545     def handle_endtag(self, tag):
 546         self._level -= 1
 547
 548
 549 def extract_attributes(html_element):
 550     """Given a string for an HTML element such as
 551     <el
 552          a="foo" B="bar" c="&98;az" d=boz
 553          empty= noval entity="&amp;"
 554          sq='"' dq="'"
 555     >
 556     Decode and return a dictionary of attributes.
 557     {
 558         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 559         'empty': '', 'noval': None, 'entity': '&',
 560         'sq': '"', 'dq': '\''
 561     }.
 562     """
 563     parser = HTMLAttributeParser()
 564     with contextlib.suppress(compat_HTMLParseError):
 565         parser.feed(html_element)
 566         parser.close()
 567     return parser.attrs
 568
 569
 570 def parse_list(webpage):
 571     """Given a string for an series of HTML <li> elements,
 572     return a dictionary of their attributes"""
 573     parser = HTMLListAttrsParser()
 574     parser.feed(webpage)
 575     parser.close()
 576     return parser.items
 577
 578
 579 def clean_html(html):
 580     """Clean an HTML snippet into a readable string"""
 581
 582     if html is None:  # Convenience for sanitizing descriptions etc.
 583         return html
 584
 585     html = re.sub(r'\s+', ' ', html)
 586     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 587     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 588     # Strip html tags
 589     html = re.sub('<.*?>', '', html)
 590     # Replace html entities
 591     html = unescapeHTML(html)
 592     return html.strip()
 593
 594
 595 class LenientJSONDecoder(json.JSONDecoder):
 596     # TODO: Write tests
 597     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 598         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 599         self._close_attempts = 2 * close_objects
 600         super().__init__(*args, **kwargs)
 601
 602     @staticmethod
 603     def _close_object(err):
 604         doc = err.doc[:err.pos]
 605         # We need to add comma first to get the correct error message
 606         if err.msg.startswith('Expecting \',\''):
 607             return doc + ','
 608         elif not doc.endswith(','):
 609             return
 610
 611         if err.msg.startswith('Expecting property name'):
 612             return doc[:-1] + '}'
 613         elif err.msg.startswith('Expecting value'):
 614             return doc[:-1] + ']'
 615
 616     def decode(self, s):
 617         if self.transform_source:
 618             s = self.transform_source(s)
 619         for attempt in range(self._close_attempts + 1):
 620             try:
 621                 if self.ignore_extra:
 622                     return self.raw_decode(s.lstrip())[0]
 623                 return super().decode(s)
 624             except json.JSONDecodeError as e:
 625                 if e.pos is None:
 626                     raise
 627                 elif attempt < self._close_attempts:
 628                     s = self._close_object(e)
 629                     if s is not None:
 630                         continue
 631                 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
 632         assert False, 'Too many attempts to decode JSON'
 633
 634
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    A filename of '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        # Use the underlying binary buffer when stdout has one
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # At most two attempts: the name as given, then the sanitized name
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Opening with a lock failed; fall back to a plain open()
                # NOTE(review): this relies on LockingUnsupportedError being an OSError - confirm
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt, or right away on permission errors
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Sanitizing did not change the name, so retrying would be pointless
            if old_filename == filename:
                raise
 671                 raise
 672
 673
 674 def timeconvert(timestr):
 675     """Convert RFC 2822 defined time string into system timestamp"""
 676     timestamp = None
 677     timetuple = email.utils.parsedate_tz(timestr)
 678     if timetuple is not None:
 679         timestamp = email.utils.mktime_tz(timetuple)
 680     return timestamp
 681
 682
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; an empty input gives an empty
                        result, otherwise the result is never empty
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Returns the replacement for a single character. Replacements that
        # start with '\0' are marked as substitutes, so that they can be
        # collapsed and trimmed below before the markers are removed.
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Question marks and ASCII control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the substitute markers; fall back to '_' if nothing is left
    result = result.replace('\0', '') or '_'

    # Only runs for an explicitly falsy is_id (the NO_DEFAULT class is truthy)
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        # Replace a leading dash with an underscore
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 736
 737
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    On other platforms the path is returned unchanged unless force is set.

    @param s      The path to sanitize
    @param force  Apply the sanitization on non-Windows platforms as well
    """
    if sys.platform == 'win32':
        # `force` is reset here, so the leading-separator branch at the end
        # only ever applies on other platforms
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the first component left over from splitting the path that followed the drive
        norm_path.pop(0)
    # Replace the listed special characters, and a trailing whitespace or dot,
    # with '#' in every component other than '.' and '..'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
 759
 760
 761 def sanitize_url(url, *, scheme='http'):
 762     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 763     # the number of unwanted failures due to missing protocol
 764     if url is None:
 765         return
 766     elif url.startswith('//'):
 767         return f'{scheme}:{url}'
 768     # Fix some common typos seen so far
 769     COMMON_TYPOS = (
 770         # https://github.com/ytdl-org/youtube-dl/issues/15649
 771         (r'^httpss://', r'https://'),
 772         # https://bx1.be/lives/direct-tv/
 773         (r'^rmtp([es]?)://', r'rtmp\1://'),
 774     )
 775     for mistake, fixup in COMMON_TYPOS:
 776         if re.match(mistake, url):
 777             return re.sub(mistake, fixup, url)
 778     return url
 779
 780
 781 def extract_basic_auth(url):
 782     parts = urllib.parse.urlsplit(url)
 783     if parts.username is None:
 784         return url, None
 785     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 786         parts.hostname if parts.port is None
 787         else '%s:%d' % (parts.hostname, parts.port))))
 788     auth_payload = base64.b64encode(
 789         ('%s:%s' % (parts.username, parts.password or '')).encode())
 790     return url, f'Basic {auth_payload.decode()}'
 791
 792
 793 def sanitized_Request(url, *args, **kwargs):
 794     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 795     if auth_header is not None:
 796         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 797         headers['Authorization'] = auth_header
 798     return urllib.request.Request(url, *args, **kwargs)
 799
 800
 801 def expand_path(s):
 802     """Expand shell variables and ~"""
 803     return os.path.expandvars(compat_expanduser(s))
 804
 805
 806 def orderedSet(iterable, *, lazy=False):
 807     """Remove all duplicates from the input iterable"""
 808     def _iter():
 809         seen = []  # Do not use set since the items can be unhashable
 810         for x in iterable:
 811             if x not in seen:
 812                 seen.append(x)
 813                 yield x
 814
 815     return _iter() if lazy else list(_iter())
 816
 817
 818 def _htmlentity_transform(entity_with_semicolon):
 819     """Transforms an HTML entity to a character."""
 820     entity = entity_with_semicolon[:-1]
 821
 822     # Known non-numeric HTML entity
 823     if entity in html.entities.name2codepoint:
 824         return chr(html.entities.name2codepoint[entity])
 825
 826     # TODO: HTML5 allows entities without a semicolon.
 827     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 828     if entity_with_semicolon in html.entities.html5:
 829         return html.entities.html5[entity_with_semicolon]
 830
 831     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 832     if mobj is not None:
 833         numstr = mobj.group(1)
 834         if numstr.startswith('x'):
 835             base = 16
 836             numstr = '0%s' % numstr
 837         else:
 838             base = 10
 839         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 840         with contextlib.suppress(ValueError):
 841             return chr(int(numstr, base))
 842
 843     # Unknown entity in name, return its literal representation
 844     return '&%s;' % entity
 845
 846
 847 def unescapeHTML(s):
 848     if s is None:
 849         return None
 850     assert isinstance(s, str)
 851
 852     return re.sub(
 853         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 854
 855
 856 def escapeHTML(text):
 857     return (
 858         text
 859         .replace('&', '&amp;')
 860         .replace('<', '&lt;')
 861         .replace('>', '&gt;')
 862         .replace('"', '&quot;')
 863         .replace("'", '&#39;')
 864     )
 865
 866
def process_communicate_or_kill(p, *args, **kwargs):
    """Deprecated alias for Popen.communicate_or_kill.

    Issues a deprecation warning, then calls Popen.communicate_or_kill on
    the process p with the remaining arguments and returns its result.
    """
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)
 871
 872
 873 class Popen(subprocess.Popen):
 874     if sys.platform == 'win32':
 875         _startupinfo = subprocess.STARTUPINFO()
 876         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 877     else:
 878         _startupinfo = None
 879
 880     @staticmethod
 881     def _fix_pyinstaller_ld_path(env):
 882         """Restore LD_LIBRARY_PATH when using PyInstaller
 883             Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
 884                  https://github.com/yt-dlp/yt-dlp/issues/4573
 885         """
 886         if not hasattr(sys, '_MEIPASS'):
 887             return
 888
 889         def _fix(key):
 890             orig = env.get(f'{key}_ORIG')
 891             if orig is None:
 892                 env.pop(key, None)
 893             else:
 894                 env[key] = orig
 895
 896         _fix('LD_LIBRARY_PATH')  # Linux
 897         _fix('DYLD_LIBRARY_PATH')  # macOS
 898
 899     def __init__(self, *args, env=None, text=False, **kwargs):
 900         if env is None:
 901             env = os.environ.copy()
 902         self._fix_pyinstaller_ld_path(env)
 903
 904         self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
 905         if text is True:
 906             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 907             kwargs.setdefault('encoding', 'utf-8')
 908             kwargs.setdefault('errors', 'replace')
 909         super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
 910
 911     def communicate_or_kill(self, *args, **kwargs):
 912         try:
 913             return self.communicate(*args, **kwargs)
 914         except BaseException:  # Including KeyboardInterrupt
 915             self.kill(timeout=None)
 916             raise
 917
 918     def kill(self, *, timeout=0):
 919         super().kill()
 920         if timeout != 0:
 921             self.wait(timeout=timeout)
 922
 923     @classmethod
 924     def run(cls, *args, timeout=None, **kwargs):
 925         with cls(*args, **kwargs) as proc:
 926             default = '' if proc.__text_mode else b''
 927             stdout, stderr = proc.communicate_or_kill(timeout=timeout)
 928             return stdout or default, stderr or default, proc.returncode
 929
 930
 931 def encodeArgument(s):
 932     # Legacy code that uses byte strings
 933     # Uncomment the following line after fixing all post processors
 934     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 935     return s if isinstance(s, str) else s.decode('ascii')
 936
 937
 938 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 939
 940
 941 def timetuple_from_msec(msec):
 942     secs, msec = divmod(msec, 1000)
 943     mins, secs = divmod(secs, 60)
 944     hrs, mins = divmod(mins, 60)
 945     return _timetuple(hrs, mins, secs, msec)
 946
 947
 948 def formatSeconds(secs, delim=':', msec=False):
 949     time = timetuple_from_msec(secs * 1000)
 950     if time.hours:
 951         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 952     elif time.minutes:
 953         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 954     else:
 955         ret = '%d' % time.seconds
 956     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 957
 958
 959 def _ssl_load_windows_store_certs(ssl_context, storename):
 960     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 961     try:
 962         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 963                  if encoding == 'x509_asn' and (
 964                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 965     except PermissionError:
 966         return
 967     for cert in certs:
 968         with contextlib.suppress(ssl.SSLError):
 969             ssl_context.load_verify_locations(cadata=cert)
 970
 971
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler whose SSL context is configured from `params`.

    Keys read from `params`: 'nocheckcertificate', 'legacyserverconnect',
    'compat_opts', 'client_certificate', 'client_certificate_key' and
    'client_certificate_password'.

    @param params  Options mapping; it is also passed on to the handler
    @param kwargs  Additional keyword arguments for YoutubeDLHTTPSHandler
    @raises YoutubeDLError  If the client certificate cannot be loaded
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # check_hostname is assigned here, before verify_mode is set further below
    # NOTE(review): the ssl module is documented to reject CERT_NONE while
    # check_hostname is enabled, so this ordering looks deliberate - confirm before reordering
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer the certifi bundle, unless it is missing or disabled through the 'no-certifi' compat option
        if certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    # Optional client certificate; key file and password are passed along as given (possibly None)
    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1033
1034
def bug_reports_message(before=';'):
    """Return the "please report this issue" text, to be appended after `before`.

    `before` is stripped of trailing whitespace. The message is capitalized when
    `before` is empty or ends a sentence ('.', '!' or '?'); a non-empty `before`
    is joined to the message with a single space.
    """
    from ..update import REPOSITORY

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    before = before.rstrip()
    starts_new_sentence = not before or before.endswith(('.', '!', '?'))
    if starts_new_sentence:
        msg = msg[0].title() + msg[1:]

    prefix = before + ' ' if before else ''
    return prefix + msg
1046
1047
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Default message; subclasses may override it with a string
    msg = None

    def __init__(self, msg=None):
        """Use `msg` if given, else the class-level msg, else the class name."""
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
1058
1059
# Exception classes treated as network errors; ssl.CertificateError is included when ssl provides it
network_exceptions = (
    urllib.error.URLError,
    http.client.HTTPException,
    socket.error,
    *((ssl.CertificateError,) if hasattr(ssl, 'CertificateError') else ()),
)
1064
1065
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.

        cause, video_id and ie are optional; when given, they are included in the
        final message (see the __msg property).
        """
        # An error created while one of the network exceptions is being handled is always "expected"
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When wrapping another ExtractorError, keep the exc_info that one recorded
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        """The full message: '[ie] ', 'video_id: ', the original message, the cause and,
        unless the error is expected, the bug report text."""
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted traceback and/or formatted cause joined by a newline, or None if neither exists."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        """Set the attribute; once msg is non-empty, rebuild msg and args after any other attribute changes."""
        super().__setattr__(name, value)
        # 'msg' and 'args' are excluded since they are the attributes assigned below
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
1107
1108
class UnsupportedError(ExtractorError):
    """Expected extraction error for a URL that is not supported; the URL is kept in `url`."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1114
1115
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
1119
1120
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    It is always marked as an expected error.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # 'expected' is forced to True, overriding any value passed by the caller
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
1132
1133
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        # Always an expected error; a default message is used when none (or an empty one) is given
        message = msg or 'The channel is not currently live'
        super().__init__(message, **{**kwargs, 'expected': True})
1140
1141
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
1154
1155
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict.
    """
    # Message used when the exception is created without one
    msg = 'Entry not found in info'
1163
1164
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        # The filename, if given, is appended to the default message
        message = self.msg if filename is None else f'{self.msg}: {filename}'
        super().__init__(message)
1177
1178
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task. It adds nothing to
    YoutubeDLError besides its type.
    """
1185
1186
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Message used when the exception is created without one
    msg = 'The download was cancelled'
1190
1191
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered: a video already in the archive was encountered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1195
1196
class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered: a video that did not match the filter was encountered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1200
1201
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Message used when the exception is created without one
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1205
1206
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        # `expected` is stored as given
        self.expected = expected
        super().__init__(msg)
1213
1214
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always raised with the class-level message and expected=False
        super().__init__(msg=self.msg, expected=False)
1221
1222
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        # The error detail, if given, is appended to the default message
        message = self.msg if err is None else f'{self.msg}: {err}'
        super().__init__(message)
1235
1236
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
1250
1251
class XAttrMetadataError(YoutubeDLError):
    """Error with a `reason` derived from the error code and message.

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        no_space = (
            self.code in (errno.ENOSPC, errno.EDQUOT)
            or any(text in self.msg for text in ('No space left', 'Disk quota exceeded')))
        if no_space:
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1266
1267
class XAttrUnavailableError(YoutubeDLError):
    """XAttr-related error that adds nothing to YoutubeDLError besides its type.

    NOTE(review): the conditions under which it is raised are defined by callers outside this section.
    """
1270
1271
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate `http_class` and, if a source address is configured, bind connections to it.

    @param ydl_handler  Handler whose _params mapping may contain 'source_address'
    @param http_class   Connection class, instantiated with *args and **kwargs
    @param is_https     Unused in this function
    @returns            The connection object
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return hc

    # This is to workaround _create_connection() from socket where it will try all
    # address data from getaddrinfo() including IPv6. This filters the result from
    # getaddrinfo() based on the source_address value.
    # This is based on the cpython socket.create_connection() function.
    # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
    def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
        host, port = address
        candidates = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
        # Only remote addresses of the same family as the source address are usable
        family = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
        usable = [info for info in candidates if info[0] == family]
        if candidates and not usable:
            ip_version = 'v4' if family == socket.AF_INET else 'v6'
            raise OSError(
                "No remote IP%s addresses available for connect, can't use '%s' as source address"
                % (ip_version, source_address[0]))

        last_error = None
        for sock_family, socktype, proto, _canonname, sockaddr in usable:
            sock = None
            try:
                sock = socket.socket(sock_family, socktype, proto)
                if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                    sock.settimeout(timeout)
                sock.bind(source_address)
                sock.connect(sockaddr)
                last_error = None  # Explicitly break reference cycle
                return sock
            except OSError as exc:
                last_error = exc
                if sock is not None:
                    sock.close()

        if last_error is None:
            raise OSError('getaddrinfo returns an empty list')
        raise last_error

    if hasattr(hc, '_create_connection'):
        hc._create_connection = _create_connection
    hc.source_address = (source_address, 0)

    return hc
1317
1318
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store `params` as self._params; all other arguments go to urllib.request.HTTPHandler."""
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open `req` over HTTP, using a SOCKS connection class if the request asks for one."""
        conn_class = http.client.HTTPConnection

        # The 'Ytdl-socks-proxy' header selects the proxy and is removed before the request is opened
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate `data`: tried as a raw stream first, then with a zlib header. Empty data is returned as is."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli `data`; empty data is returned as is."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress gzip `data`, tolerating up to 1023 bytes of trailing junk.

        If reading fails with OSError, it is retried with 1 to 1023 trailing bytes
        removed; if no attempt succeeds, the original OSError is raised.
        """
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                raise original_oserror

    def http_request(self, req):
        """Prepare an outgoing request: escape its URL, add default headers and set Accept-encoding."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Headers already present on the request take precedence over the defaults
        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Youtubedl-no-compression' in req.headers:  # deprecated
            req.headers.pop('Youtubedl-no-compression', None)
            req.add_header('Accept-encoding', 'identity')

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Decode a compressed response body and percent-encode the Location header of redirects.

        Returns `resp` itself, or a new response object wrapping the decoded body.
        """
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        # Encodings other than gzip, deflate and (if brotli is available) br are skipped
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        # Wrap the decoded body in a new response that keeps the original headers, URL, code and msg
        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS requests and responses are processed in exactly the same way
    https_request = http_request
    https_response = http_response
1446
1447
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of `base_class` that connects through a SOCKS proxy.

    @param base_class   http.client.HTTPConnection, HTTPSConnection or a subclass
    @param socks_proxy  Proxy URL with scheme socks5, socks4a, socks4 or socks
                        (same as socks4); port defaults to 1080 and
                        username/password are percent-decoded
    @returns            The connection class
    @raises ValueError  If the proxy URL has an unsupported scheme
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear error instead of an UnboundLocalError further below
        raise ValueError(f'Unsupported SOCKS proxy scheme: {url_components.scheme!r}')

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # HTTPSConnection always has an SSL context on Python 3; the old
            # ssl.wrap_socket() fallback was removed from the ssl module in Python 3.12
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(
                    self.sock, server_hostname=self.host)

    return SocksConnection
1489
1490
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler supporting a custom connection class, SOCKS proxies and a source address."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Store `params` and the connection class (http.client.HTTPSConnection if not given);
        the remaining arguments go to urllib.request.HTTPSHandler."""
        super().__init__(*args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open `req` over HTTPS.

        An SSLV3_ALERT_HANDSHAKE_FAILURE is turned into a YoutubeDLError
        suggesting --legacy-server-connect; any other URLError is re-raised.
        """
        # Pass on only those of _context / _check_hostname that this Python version defines
        open_kwargs = {
            name: getattr(self, f'_{name}')
            for name in ('context', 'check_hostname')
            if hasattr(self, f'_{name}')
        }

        conn_class = self._https_conn_class
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, conn_class, True)
        try:
            return self.do_open(connection_factory, req, **open_kwargs)
        except urllib.error.URLError as e:
            handshake_failure = (
                isinstance(e.reason, ssl.SSLError)
                and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE')
            if not handshake_failure:
                raise
            raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1519
1520
def is_path_like(f):
    """Whether `f` is a str, bytes or os.PathLike object."""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1523
1524
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """HTTPCookieProcessor that processes HTTPS requests and responses like HTTP ones."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    # Reuse the HTTP processing for HTTPS
    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1534
1535
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7231
     and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the request that follows a redirect of `req` to `newurl`.

        The method is changed to GET for a 303 (unless it is HEAD) and for a POST
        answered with 301/302. When the method changes, the payload and the
        Content-Length / Content-Type headers are dropped.

        @raises urllib.error.HTTPError  If `code` is not 301, 302, 303, 307 or 308
        """
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        old_method = req.get_method()
        new_method = old_method
        new_data = req.data
        # Lowercased names of the headers that must not be carried over
        remove_headers = set()
        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and old_method != 'HEAD':
            new_method = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        elif code in (301, 302) and old_method == 'POST':
            new_method = 'GET'

        # only remove payload if method changed (e.g. POST to GET)
        if new_method != old_method:
            new_data = None
            remove_headers.update(('content-length', 'content-type'))

        # urllib stores header names capitalized (e.g. 'Content-type'),
        # so the comparison has to be done case-insensitively
        new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
1579
1580
def extract_timezone(date_str):
    """Split a trailing timezone off `date_str`.

    Returns (offset, remaining_date_str), where offset is a datetime.timedelta.
    A numeric offset or 'Z' is tried first, then a name listed in TIMEZONE_NAMES;
    if neither is found, the offset is zero and the string is returned unchanged.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]
        sign = m.group('sign')
        if not sign:
            # Plain 'Z', i.e. no offset
            return datetime.timedelta(), date_str
        factor = 1 if sign == '+' else -1
        offset = datetime.timedelta(
            hours=factor * int(m.group('hours')),
            minutes=factor * int(m.group('minutes')))
        return offset, date_str

    # No numeric offset: look for a timezone abbreviation following the time
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    named_offset = TIMEZONE_NAMES.get(m and m.group('tz').strip())
    if named_offset is not None:
        date_str = date_str[:-len(m.group('tz'))]
    return datetime.timedelta(hours=named_offset or 0), date_str
1609
1610
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """
    Convert a "YYYY-MM-DD<delimiter>HH:MM:SS" date/time string into a UNIX timestamp

    @param delimiter    Separator between the date and the time part
    @param timezone     UTC offset as datetime.timedelta; extracted from date_str when None
    @returns            The timestamp, or None if date_str is None or does not fit the format
    """
    if date_str is None:
        return None

    # Fractional seconds are discarded
    trimmed = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, trimmed = extract_timezone(trimmed)

    try:
        local_dt = datetime.datetime.strptime(trimmed, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((local_dt - timezone).timetuple())
    except ValueError:
        return None
1626
1627
def date_formats(day_first=True):
    """Return the collection of strptime formats for day-first or month-first dates"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1630
1631
def unified_strdate(date_str, day_first=True):
    """
    Normalize a free-form date string to the format YYYYMMDD

    @param day_first    Selects which set of formats from date_formats() is tried
    @returns            The date as a string, or None if date_str is None or unparsable
    """
    if date_str is None:
        return None

    # Commas become spaces; AM/PM (and a timezone word right after it) is dropped
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    cleaned = extract_timezone(cleaned)[1]

    parsed = None
    # NB: Every format is tried, so the last one that matches wins
    for fmt in date_formats(day_first):
        try:
            parsed = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if parsed is None:
        # Fall back to the email (RFC 2822) date parser
        fields = email.utils.parsedate_tz(cleaned)
        if fields:
            with contextlib.suppress(ValueError):
                parsed = datetime.datetime(*fields[:6]).strftime('%Y%m%d')

    return None if parsed is None else str(parsed)
1654
1655
def unified_timestamp(date_str, day_first=True):
    """
    Parse a free-form date/time string into a UNIX timestamp

    @param day_first    Selects which set of formats from date_formats() is tried
    @returns            The timestamp, or None if date_str is None or unparsable
    """
    if date_str is None:
        return None

    # Drop commas, pipes and the weekday names matched below, then squeeze whitespace
    without_weekday = re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)
    date_str = re.sub(r'\s+', ' ', without_weekday)

    pm_hours = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    unknown_tz = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if unknown_tz:
        date_str = date_str[:-len(unknown_tz.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    micro = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if micro:
        date_str = micro.group(1)

    pm_offset = datetime.timedelta(hours=pm_hours)
    for fmt in date_formats(day_first):
        try:
            parsed = datetime.datetime.strptime(date_str, fmt) - timezone + pm_offset
            return calendar.timegm(parsed.timetuple())
        except ValueError:
            continue

    # Last resort: the email (RFC 2822) date parser
    fields = email.utils.parsedate_tz(date_str)
    if fields:
        return calendar.timegm(fields) + pm_hours * 3600 - timezone.total_seconds()
1687
1688
def determine_ext(url, default_ext='unknown_video'):
    """
    Guess the file extension from a URL

    @returns    The text after the last "." of the part before "?" if it is purely
                alphanumeric, or a known extension followed by slashes; else default_ext
    """
    if url is None or '.' not in url:
        return default_ext
    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    return trimmed if trimmed in KNOWN_EXTENSIONS else default_ext
1700
1701
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return filename with its extension replaced by "<sub_lang>.<sub_format>" (via replace_extension)"""
    sub_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, sub_ext, expected_real_ext)
1704
1705
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Build a datetime object out of an absolute or relative date expression.
    Accepted expressions:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strptime format that DATE is written in
    @param precision    Unit to round the result to: auto|microsecond|second|minute|hour|day
                        auto: use the unit of the offset in date_str (if there is one).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    now = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return now
    if date_str == 'yesterday':
        return now - datetime.timedelta(days=1)

    offset_match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if offset_match is None:
        # A plain DATE
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # Resolve the part before the offset first, then shift it
    base = datetime_from_str(offset_match.group('start'), precision, format)
    amount = int(offset_match.group('time'))
    if offset_match.group('sign') == '-':
        amount = -amount
    unit = offset_match.group('unit')
    if unit in ('month', 'year'):
        # Calendar-aware shift; the result is afterwards treated as day precision
        shifted = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + datetime.timedelta(**{unit + 's': amount})
    return datetime_round(shifted, unit) if auto_precision else shifted
1746
1747
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Build a date object out of a string by delegating to datetime_from_str

    @param strict  Only accept "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
                   and raise ValueError for anything else
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1758
1759
def datetime_add_months(dt, months):
    """Shift a datetime object by a (possibly negative) number of months, clamping the day to the target month"""
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    year, month = dt.year + year_shift, month_index + 1
    # e.g. Jan 31 + 1 month lands on the last day of February
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1767
1768
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    @param dt           datetime whose date/time fields are taken as UTC
    @param precision    microsecond|second|minute|hour|day
    @returns            dt itself for "microsecond"; otherwise a new naive datetime
                        rounded to the nearest multiple of the unit
    @raises KeyError    if precision is none of the above
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    # timetuple() has no sub-second field, so microseconds are dropped before rounding
    timestamp = calendar.timegm(dt.timetuple())
    rounded = ((timestamp + unit_seconds / 2) // unit_seconds) * unit_seconds
    # datetime.utcfromtimestamp() is deprecated since Python 3.12;
    # an aware UTC datetime with its tzinfo stripped is the same value
    return datetime.datetime.fromtimestamp(rounded, datetime.timezone.utc).replace(tzinfo=None)
1785
1786
def hyphenate_date(date_str):
    """
    Turn a date in 'YYYYMMDD' format into 'YYYY-MM-DD'; any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    year, month, day = parts.groups()
    return f'{year}-{month}-{day}'
1795
1796
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """
        start and end must be strings in the format accepted by date_from_str(strict=True).
        A side given as None is left open (earliest/latest representable date)

        @raises ValueError  if start is after end
        """
        if start is not None:
            self.start = date_from_str(start, strict=True)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end, strict=True)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """
        Check if the date is in the range

        date may be a datetime.date, a datetime.datetime (only its date part is
        considered) or a string accepted by date_from_str
        """
        if isinstance(date, datetime.datetime):
            # datetime is a subclass of date, but the two cannot be ordered
            # against each other, so compare by calendar date
            date = date.date()
        elif not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1830
1831
@functools.cache
def system_identifier():
    """
    Return a one-line description of the running interpreter and platform
    (Python version/implementation, machine, architecture, platform, OpenSSL and libc versions)
    """
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_ver = []

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1850
1851
@functools.cache
def get_windows_version():
    ''' Get Windows version as a version tuple. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1859
1860
def write_string(s, out=None, encoding=None):
    """
    Write the string s to out (sys.stderr by default) and flush it

    The text is encoded (ignoring unencodable characters) when out is opened in
    binary mode or exposes a `buffer`; otherwise it is written as-is.
    """
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(out, 'mode', ''):
        target, enc = out, encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
    else:
        target, enc = out, None

    payload = s.encode(enc, 'ignore') if enc else s
    target.write(payload)
    out.flush()
1880
1881
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """
    Report a deprecation

    In the CLI each distinct msg is reported once, through printer if given and
    write_string otherwise (kwargs are forwarded). Outside the CLI a
    DeprecationWarning is issued through the warnings module instead.
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    already_reported = deprecation_warning._cache
    if msg in already_reported:
        return
    already_reported.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Messages that were already reported in CLI mode
deprecation_warning._cache = set()
1897
1898
def bytes_to_intlist(bs):
    """Return the items of bs as a list of ints; non-int items are converted with ord()"""
    if not bs:
        return []
    # Only the first item is inspected to decide how to convert
    if not isinstance(bs[0], int):
        return list(map(ord, bs))
    return list(bs)  # Python 3
1906
1907
def intlist_to_bytes(xs):
    """Pack a sequence of ints (each 0-255) into a bytes object"""
    if xs:
        return struct.pack(f'{len(xs)}B', *xs)
    return b''
1912
1913
class LockingUnsupportedError(OSError):
    """OSError raised with a fixed message when file locking is not available"""
    # Message passed to OSError by __init__
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1919
1920
# Cross-platform file locking
# Every branch below defines the same pair of helpers:
#   _lock_file(f, exclusive, block)   lock the open file object `f`
#   _unlock_file(f)                   release that lock again
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Size of the locked byte range, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock `f` with LockFileEx; raises BlockingIOError if the call fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object since _unlock_file needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError if UnlockFileEx fails"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock `f` with flock(), falling back to lockf() on other OSErrors"""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # The lock is held elsewhere; do not retry with lockf()
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Unlock `f`, trying each unlocking variant until one succeeds"""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            # Last attempt is not suppressed, so its OSError reaches the caller
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            """No fcntl module available: locking is unsupported"""
            raise LockingUnsupportedError()

        def _unlock_file(f):
            """No fcntl module available: locking is unsupported"""
            raise LockingUnsupportedError()
2007
2008
class locked_file:
    """
    Wrapper around a file object that holds a file lock while in use

    The file is opened on construction. __enter__()/open() takes the lock
    (shared for the modes "r"/"rb", exclusive otherwise) and __exit__()/close()
    releases it and closes the file. Attribute access and iteration are
    forwarded to the underlying file object.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """
        @param mode         One of r, rb, a, ab, w, wb
        @param block        Forwarded to _lock_file when the lock is taken
        @param encoding     Forwarded to os.fdopen
        @raises NotImplementedError     for any other mode
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        """Take the lock (closing the file if that fails) and truncate the file for "w" modes"""
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    # __exit__ is not run when __enter__ raises, so release
                    # the lock and close the file here before propagating
                    with contextlib.suppress(OSError):
                        self.__exit__()
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; `locked` is reset even if unlocking fails"""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        """Unlock and close the file; the file is closed even if unlocking fails"""
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Only reached for attributes not found on the wrapper itself
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2072
2073
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None (cached)"""
    detected = sys.getfilesystemencoding()
    if detected is None:
        return 'utf-8'
    return detected
2078
2079
def shell_quote(args):
    """Quote every argument for the shell and join them with spaces; bytes are decoded with the filesystem encoding"""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
2089
2090
def smuggle_url(url, data):
    """
    Pass additional data in a URL for internal use.

    The data is JSON-encoded into the "__youtubedl_smuggle" item of the URL fragment.
    If url already carries smuggled data, both are merged and the values already
    in the URL take precedence.

    @param data     dict of JSON-serializable values; it is not modified
    @returns        The URL with the merged data smuggled in
    """
    url, idata = unsmuggle_url(url, {})
    # Merge into a fresh dict instead of update()-ing the caller's `data` in place
    merged = {**data, **idata}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
2099
2100
def unsmuggle_url(smug_url, default=None):
    """
    Split a URL into the plain URL and the data smuggled in its fragment

    @returns    (url, data); (smug_url, default) if nothing was smuggled
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return plain_url, json.loads(payload)
2108
2109
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """
    Formats numbers with decimal suffixes like K, M, etc

    @param num      Number to format (converted with float_or_none)
    @param fmt      %-format applied to the (scaled number, suffix) pair
    @param factor   Ratio between consecutive suffixes; 1024 gives Ki, Mi, ...
    @returns        The formatted string, or None if num is not a number or is negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to [0, len(POSSIBLE_SUFFIXES)]: for 0 < num < 1/factor the logarithm
        # truncates to a negative value, which would index the suffixes from the end
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2122
2123
def format_bytes(bytes):
    """Format a byte count with binary (1024-based) suffixes and two decimals; 'N/A' if it cannot be formatted"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2126
2127
def lookup_unit_table(unit_table, s, strict=False):
    """
    Parse "<number><unit>" from s and scale the number by the unit's multiplier

    @param unit_table   Mapping of unit name to multiplier
    @param strict       Require the whole string to match and only "." as decimal
                        separator; otherwise a prefix match with "," or "." is enough
    @returns            The rounded product, or None if s does not match
    """
    if strict:
        num_re, matcher = NUMBER_RE, re.fullmatch
    else:
        num_re, matcher = NUMBER_RE.replace(R'\.', '[,.]'), re.match
    units_re = '|'.join(map(re.escape, unit_table))
    mobj = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not mobj:
        return None

    value = float(mobj.group('num').replace(',', '.'))
    return round(value * unit_table[mobj.group('unit')])
2139
2140
def parse_bytes(s):
    """Parse a string such as "10K" or "1.5M" (1024-based, case-insensitive) into a number of bytes"""
    binary_units = {}
    for power, unit in enumerate(('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')):
        binary_units[unit] = 1024 ** power
    return lookup_unit_table(binary_units, s.upper(), strict=True)
2146
2147
def parse_filesize(s):
    """Parse a human-readable file size such as "5 MiB" or "1.2GB" into a number of bytes; None if s is None or unparsable"""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (symbol, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = symbol.lower()
        # Lower-case prefix + "B" is read as the binary unit; every other
        # abbreviation without "i" is decimal
        unit_table.update({
            f'{symbol}iB': binary,
            f'{symbol}B': decimal,
            f'{lower}B': binary,
            f'{symbol}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
2217
2218
def parse_count(s):
    """Parse a count such as "1,234", "1.2M" or "Views 5k" into an int; None if s is None or unparsable"""
    if s is None:
        return None

    # Drop a leading non-numeric label that is followed by whitespace
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, s)
    if scaled is not None:
        return scaled

    # No known unit: use the leading number on its own
    leading_number = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(leading_number.group(1)) if leading_number else None
2246
2247
def parse_resolution(s, *, lenient=False):
    """
    Extract video dimensions from a string

    Tried in order: "<width>x<height>", "<height>p"/"<height>i", "4k"/"8k".

    @param lenient  Allow the "<width>x<height>" form to touch other alphanumerics
    @returns        dict with "width" and/or "height"; empty if nothing was found
    """
    if s is None:
        return {}

    if lenient:
        size_re = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    else:
        size_re = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    size = re.search(size_re, s)
    if size:
        return {
            'width': int(size.group('w')),
            'height': int(size.group('h')),
        }

    scanlines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scanlines:
        return {'height': int(scanlines.group(1))}

    kilo = re.search(r'\b([48])[kK]\b', s)
    return {'height': int(kilo.group(1)) * 540} if kilo else {}
2271
2272
def parse_bitrate(s):
    """Return the number in front of "kbps" in s as an int; None if s is not a string or has no such number"""
    if not isinstance(s, str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2279
2280
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of a month by its full name in the given language;
        unknown languages fall back to English, unknown names give None """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
2290
2291
def month_by_abbreviation(abbrev):
    """ Return the number of a month by its (locale-independently) English
        three-letter abbreviation, or None if there is no such month """

    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
2300
2301
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a predefined or numeric entity by '&amp;'"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
2308
2309
def setproctitle(title):
    """
    Set the name of the current process to title (best effort)

    Uses prctl() from "libc.so.6"; nothing happens if ctypes, that library
    or its prctl symbol are unavailable.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initializing the buffer from the bytes makes it one byte larger than the
    # title, so the name handed to prctl() is always NUL-terminated
    buf = ctypes.create_string_buffer(title.encode())
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 = PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
2335
2336
def remove_start(s, start):
    """Return s without the prefix start; s is returned as-is if it is None or lacks the prefix"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2339
2340
def remove_end(s, end):
    """Return s without the suffix end; s is returned as-is if it is None or lacks the suffix"""
    # An empty `end` must be excluded: every string ends with '' and s[:-0] is empty
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
2343
2344
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s, if present"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2352
2353
def get_domain(url):
    """
    Return the network location of url without a leading "www.", or None if it is empty

    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
2360
2361
def url_basename(url):
    """Return the last segment of the URL's path (ignoring leading/trailing slashes)"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
2365
2366
def base_url(url):
    """
    Return the http(s) URL up to and including the last "/" before any query or fragment

    @raises AttributeError  if url does not match
    """
    mobj = re.match(r'https?://[^?#]+/', url)
    return mobj.group()
2369
2370
def urljoin(base, path):
    """
    Join path onto base, tolerating bytes and invalid input

    @returns    path itself if it already has a "//" authority part (with or without
                scheme); None if path is empty/not a string, or if base is needed but
                is not an http(s) or protocol-relative URL; the joined URL otherwise
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not (isinstance(path, str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str):
        return None
    if re.match(r'^(?:https?:)?//', base) is None:
        return None
    return urllib.parse.urljoin(base, path)
2384
2385
class HEADRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always HEAD"""

    def get_method(self):
        return 'HEAD'
2389
2390
class PUTRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always PUT"""

    def get_method(self):
        return 'PUT'
2394
2395
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """
    Convert v to int, returning default if that is not possible

    @param get_attr     If set (and v is not None), convert this attribute of v instead
    @param scale        The result is floor-divided by this
    @param invscale     The result is multiplied by this (before dividing by scale)
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        as_int = int(v)
        return as_int * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
2403
2404
def str_or_none(v, default=None):
    """Return str(v), or default if v is None"""
    if v is None:
        return default
    return str(v)
2407
2408
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ints are returned as-is and
        ",", "." and "+" are removed from strings before converting; other types give None """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
2416
2417
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, multiplied by invscale and divided by scale; default if v is None or not convertible"""
    if v is None:
        return default
    try:
        as_float = float(v)
        return as_float * invscale / scale
    except (ValueError, TypeError):
        return default
2425
2426
def bool_or_none(v, default=None):
    """Return v if it is a bool, default otherwise"""
    if isinstance(v, bool):
        return v
    return default
2429
2430
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace if it is a str, default otherwise"""
    if isinstance(v, str):
        return v.strip()
    return default
2433
2434
def url_or_none(url):
    """Return the stripped url if it is a string starting with an accepted scheme (or "//"), else None"""
    if not url or not isinstance(url, str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2440
2441
def request_to_url(req):
    """Return the full URL of a urllib Request; anything else is returned unchanged"""
    if not isinstance(req, urllib.request.Request):
        return req
    return req.get_full_url()
2447
2448
def strftime_or_none(timestamp, date_format, default=None):
    """
    Format a timestamp with a strftime format, returning default on failure

    @param timestamp    UNIX timestamp (int/float, formatted in UTC) or a "YYYYMMDD" string
    @param date_format  strftime format; "%s" is replaced by the integer UNIX timestamp
    @returns            The formatted string, or default if timestamp has another type,
                        cannot be parsed or is out of the supported range
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        # AttributeError: datetime_object is still None (unsupported timestamp type)
        # OverflowError/OSError: timestamp outside the range the platform can convert
        return default
2463
2464
def parse_duration(s):
    """
    Parse a duration string into a number of seconds

    Three notations are tried in order: colon-separated ("[[dd:]hh:]mm:ss[.ms]"),
    unit-suffixed including the ISO 8601 "P...T..." form ("1h 2m 3s", "PT1H2M3S"),
    and a single decimal amount of hours or minutes ("2.5 hours").

    @returns    The duration as float, or None if s is not a string, is blank
                or matches none of the notations
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None
    mobj = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if mobj:
        days, hours, mins, secs, ms = mobj.group('days', 'hours', 'mins', 'secs', 'ms')

    if not mobj:
        # Years, months and weeks are matched but do not contribute to the result
        mobj = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if mobj:
            days, hours, mins, secs, ms = mobj.groups()

    if not mobj:
        mobj = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
        if mobj is None:
            return None
        hours, mins = mobj.groups()

    if ms:
        # The fraction may have been written with ":" as separator
        ms = ms.replace(':', '.')
    weighted = ((days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1))
    return sum(float(part or 0) * mult for part, mult in weighted)
2519
2520
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the existing extension of filename

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the unmodified filename instead
    """
    stem, current_ext = os.path.splitext(filename)
    if expected_real_ext and current_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{stem}.{ext}{current_ext}'
2527
2528
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the unmodified filename instead
    """
    stem, current_ext = os.path.splitext(filename)
    if expected_real_ext and current_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = stem
    return '{}.{}'.format(base, ext)
2534
2535
def check_executable(exe, args=[]):
    """ Try to run the given binary and return its name if that succeeds.
    args can be a list of arguments for a short output (like -version).
    Returns False when running it raises OSError """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
2544
2545
def _get_exe_version_output(exe, args):
    """Run exe with args and return its combined stdout/stderr text

    Returns None if the process exits with a non-zero code
    and False if it could not be run at all (OSError)
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        output, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else output
2558
2559
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable

    @param output        text to search (must be a str)
    @param version_re    regex whose first group is the version;
                         defaults to matching "version <something>"
    @param unrecognized  value returned when the regex does not match
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2569
2570
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present.
    unrecognized holds one or two fallback values: the first is used when the
    output has no recognizable version, the last when the executable exits
    with an error """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return unrecognized[-1]
    if not output:
        # Falsy output (e.g. False when the executable could not be run) is passed through
        return output
    return detect_exe_version(output, version_re, unrecognized[0])
2581
2582
def frange(start=0, stop=None, step=1):
    """Float range

    Works like range(), but the arguments need not be integers.
    With a single argument, that argument is the stop value and counting starts at 0.
    A step of 0 yields nothing
    """
    if stop is None:
        start, stop = 0, start
    # Multiplying both sides by the direction lets one comparison serve both directions
    direction = (1 if step > 0 else -1) if step else 0
    value = start
    while direction * value < direction * stop:
        yield value
        value += step
2591
2592
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the iterable only when needed and are kept in an
    internal cache. Negative indices, open-ended slices, len(), repr()/str()
    and reversed iteration consume the whole iterable"""

    # Raised instead of the builtin IndexError by __getitem__
    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache lets __reversed__/__copy__ create views that share already evaluated items
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Replay what is already cached, then continue pulling new items
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        """Consume the rest of the iterable and return the cache (always in forward order)"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        # Returns a new list, in reverse order if this LazyList is reversed
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # ~x == -x - 1, i.e. the same position counted from the other end
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            # step 0 makes neither of the open-ended checks below apply to a plain index
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of additional items needed so that the largest requested index is cached
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Only the first item in iteration order is needed to know whether the list is empty
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # The length is only known once everything has been evaluated
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # The new object shares both the underlying iterable and the cache with this one
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        # Shallow copy: shares both the underlying iterable and the cache with this one
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2680
2681
class PagedList:
    """Base class for lists whose items are fetched one page at a time

    pagefunc(pagenum) must return an iterable with the items of the given
    (0-based) page. Subclasses implement _getslice()
    """

    # Raised by __getitem__ when the requested index does not exist
    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        # Highest page number worth requesting; unknown until a subclass lowers it
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return the items of page pagenum as a list, using the cache if possible"""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            # Pages beyond the known page count are empty without calling pagefunc
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the items with indices in [start, end) as a list; end=None means "till the end" """
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]


class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # An empty or inverted range selects nothing. Without this check the
        # per-page bounds below would never match `end`, so whole pages would
        # be yielded (and further pages requested) for e.g. getslice(5, 5)
        if end is not None and end <= start:
            return

        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets within this page of the first wanted item and of the end of the range
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember that this page failed so that it and later pages are treated as empty
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2761
2762
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        # Caching is always enabled for this variant
        PagedList.__init__(self, pagefunc, pagesize, use_cache=True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        if end is None:
            last_page = self._pagecount
            remaining = None
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start
        # Number of leading items to drop from the first page
        offset = start - first_page * self._pagesize

        for pagenum in range(first_page, last_page):
            page = self.getpage(pagenum)
            if offset:
                page = page[offset:]
                offset = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested range
                    yield from page[:remaining]
                    return
                remaining -= len(page)
            yield from page
2787
2788
class PlaylistEntries:
    """Access the entries of a playlist info dict using 1-based playlist indices

    Indexing or slicing an instance yields (playlist_index, entry) pairs.
    Entries are taken from info_dict['entries']; anything that is not a list,
    PagedList or LazyList is wrapped in a LazyList
    """
    # Placeholder for positions without an entry when only some entries were requested
    MissingEntry = object()
    # Whether the end of the entries is known to have been reached
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Put each entry at the position given by its (1-based) requested index
            # and leave all other positions as MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # One comma-separated segment: either a single index or a range "[start]:[end][:step]"
    # ("-" is also accepted as the range separator, "inf"/"infinite" as the end)
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int (single item) or a slice (range) for each comma-separated segment

        Raises ValueError for empty or malformed segments and for a step of zero
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (playlist_index, entry) for the items selected by the
        playlist_items / playliststart / playlistend params of ydl"""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries if it can be determined, else None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page, the page count is the entry count
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function mapping a 0-based index to the entry at that position.
        # That function raises self.IndexError when the index is past the end
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Generator: yields (1-based playlist index, entry) for an int or a slice of playlist indices
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Unlike Python slices, the stop index is inclusive
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # When counting upwards, nothing can follow the first missing index
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts by evaluating all the entries
        return len(tuple(self[:]))

    # Raised by the function returned from _getter for an index that is past the end
    class IndexError(IndexError):
        pass
2923
2924
def uppercase_escape(s):
    """Decode every backslash-U escape followed by 8 hex digits in s
    into the character it denotes; the rest of s is left untouched"""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        decoded, _ = decode(match.group(0))
        return decoded

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode_match, s)
2931
2932
def lowercase_escape(s):
    """Decode every backslash-u escape followed by 4 hex digits in s
    into the character it denotes; the rest of s is left untouched"""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        decoded, _ = decode(match.group(0))
        return decoded

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode_match, s)
2939
2940
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that are kept as they are, including "%" so that
    # existing percent-escapes are not escaped a second time
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe=safe_chars)
2944
2945
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)
    # The host name is IDNA-encoded; the other components are percent-escaped
    ascii_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = {
        component: escape_rfc3986(getattr(parts, component))
        for component in ('path', 'params', 'query', 'fragment')
    }
    return parts._replace(netloc=ascii_netloc, **escaped).geturl()
2956
2957
def parse_qs(url, **kwargs):
    """Parse the query string of url into a dict of lists.
    kwargs are passed on to urllib.parse.parse_qs"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2960
2961
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object and close it

    Each line of batch_fd (str or UTF-8 encoded bytes) holds one URL.
    A leading byte order mark and surrounding whitespace are removed.
    Blank lines and lines starting with "#", ";" or "]" are ignored, as is
    everything after a "#" that follows whitespace

    @returns  list of URLs in file order
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # The BOM may appear either decoded or as its three raw bytes read as separate characters
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit is passed by keyword; passing it positionally is deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2979
2980
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments with urllib.parse.urlencode
    and return the result as ASCII bytes, e.g. for use as POST data"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2983
2984
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  update query
       @returns             str
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            # Nothing to change; avoid a parse/unparse round trip
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        # Values from query_update take precedence over the existing ones
        merged_query = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged_query, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
3003
3004
def update_url_query(url, query):
    """Return url with its query parameters updated from query.
    Shorthand for update_url(url, query_update=query)"""
    updated = update_url(url, query_update=query)
    return updated
3007
3008
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Create a new request based on req with the given parts replaced/updated

    headers are merged into the headers of req, query into the query of the
    (possibly replaced) URL. HEAD and PUT requests keep their request type
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    method = req.get_method()
    request_cls = (
        HEADRequest if method == 'HEAD'
        else PUTRequest if method == 'PUT'
        else urllib.request.Request)
    new_req = request_cls(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # timeout is not a constructor argument, so it is carried over separately
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3027
3028
3029 def _multipart_encode_impl(data, boundary):
3030     content_type = 'multipart/form-data; boundary=%s' % boundary
3031
3032     out = b''
3033     for k, v in data.items():
3034         out += b'--' + boundary.encode('ascii') + b'\r\n'
3035         if isinstance(k, str):
3036             k = k.encode()
3037         if isinstance(v, str):
3038             v = v.encode()
3039         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3040         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3041         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3042         if boundary.encode('ascii') in content:
3043             raise ValueError('Boundary overlaps with data')
3044         out += content
3045
3046     out += b'--' + boundary.encode('ascii') + b'--\r\n'
3047
3048     return out, content_type
3049
3050
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple of the encoded body (bytes) and the Content-Type value.
    Raises ValueError if a specified boundary occurs in the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-supplied boundary is used as is; a clash with the data is an error
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # Retry with another random boundary
            continue
3079
3080
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether x is an instance of allowed_types but not of blocked_types.
    By default, str, bytes and mappings are blocked"""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
3085
3086
def variadic(x, allowed_types=NO_DEFAULT):
    """Return x unchanged if it is iterable-like, else wrap it in a 1-tuple.
    allowed_types are the types to be treated as a single item even though
    they are iterable; they are passed to is_iterable_like as blocked_types"""
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
3092
3093
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each of funcs with args/kwargs until one gives a usable result

    A call that raises one of the common lookup/arithmetic errors is skipped,
    as is a result that is not an instance of expected_type (if given).
    Returns None if no function gives a usable result
    """
    suppressed = (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError)
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except suppressed:
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
3103
3104
def try_get(src, getter, expected_type=None):
    """Apply getter (a function or several functions) to src and return the first
    usable result as determined by try_call, or None"""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
3107
3108
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of dct for which cndn(key, value) is true.
    By default, items whose value is None are dropped"""
    return dict(item for item in dct.items() if cndn(*item))
3111
3112
def merge_dicts(*dicts):
    """Merge dicts, with earlier dicts taking precedence

    A None value never sets a key. A value already set is only replaced
    when it is an empty string and the new value is a string
    """
    merged = {}
    for current in dicts:
        for key, value in current.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
3121
3122
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as str, decoding it with the given encoding if it is not a str already"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
3125
3126
# Age limit assigned to each US rating label.
# parse_age_limit() looks up its upper-cased input here
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
3134
3135
# Age limit assigned to each "TV-*" rating label.
# parse_age_limit() strips the first three characters ("TV-") off these keys to
# build its matching pattern, so every key must start with that prefix
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3144
3145
def parse_age_limit(s):
    """Convert an age rating into an age limit in years

    Accepts an int between 0 and 21, a string of one or two digits optionally
    followed by "+", a key of US_RATINGS, or a TV rating such as "TV-14"
    ("TV_14" and "TV14" work too). Ratings are matched case-insensitively.
    Returns None for anything else
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
3162
3163
def strip_jsonp(code):
    """Remove a JSONP wrapper such as "callback(...);" (optionally prefixed by
    "window." or "name && ", and followed by // comments) and return what was
    between the parentheses. Input without such a wrapper is returned unchanged"""
    jsonp_wrapper = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_wrapper, r'\g<callback_data>', code)
3172
3173
def js_to_json(code, vars={}, *, strict=False):
    """Convert JavaScript object/array literal code into JSON text

    @param code    the JavaScript source to convert
    @param vars    dict mapping identifiers to the text to substitute for them
    @param strict  if true, raise ValueError for identifiers that cannot be
                   resolved instead of quoting them as strings, and skip the
                   rewriting of "new ...(...)", "parseInt(...)" and
                   immediately-invoked function expressions
    @returns       str which is meant to be parsable by json.loads
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) for integer literals that JSON cannot represent: hexadecimal and octal
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Rewrites one bare double quote or backslash escape inside a string so it is valid in JSON
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Evaluates the expression of a ${...} placeholder inside a template string
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Returns the JSON replacement for one token matched by the final re.sub below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, "!" operators and trailing commas are dropped
            return ''

        if v[0] in STRING_QUOTES:
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Integers used as keys are quoted, since JSON keys must be strings
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                # The substitute is not valid JSON as it is, so encode it as a JSON string
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # Converts the [key, value] pairs given to "new Map(...)" into a JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    # Tokens handed to fix_kv: strings, comments, trailing commas, "void 0",
    # identifiers, hexadecimal/octal integers, integer keys and runs of "!"
    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
3252
3253
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function giving the position of a quality id in quality_ids,
    or -1 for an id that is not in it """
    def rank(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            return -1
        return position
    return rank
3262
3263
# Names of the stages at which a postprocessor may be run
# NOTE(review): presumably listed in execution order - confirm against the code using it
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output filename templates, keyed by output type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output types other than 'default'
# NOTE(review): the non-None values look like filename suffixes for that type - verify against the callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# Template for a verbose regex matching one %-style conversion specifier.
# {0} and {1} are placeholders for the pattern of the mapping key and of the
# conversion type respectively, to be filled in (e.g. with str.format) before use.
# The "prefix" group captures any escaped "%%" directly before the specifier
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# Conversion type characters of %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3303
3304
def limit_length(s, length):
    """ Add ellipses to overly long strings

    @param s       the string to shorten, or None
    @param length  maximum length of the result
    @returns       s if it is at most length characters long, else its
                   beginning followed by "..." so that the result is exactly
                   length characters long. If length leaves no room for any
                   text, only (part of) the ellipses is returned.
                   None if s is None
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length <= len(ELLIPSES):
        # A negative slice index below would count from the end of s
        # and make the result longer than requested
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
3313
3314
def version_tuple(v):
    """Split a version string at "." and "-" into a tuple of ints.
    Raises ValueError if any of the parts is not an integer"""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
3317
3318
def is_outdated_version(version, limit, assume_new=True):
    """Whether version is older than limit.
    If version is empty or either version cannot be parsed,
    the answer is the opposite of assume_new"""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
3326
3327
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """
    # Imported here rather than at module level
    from ..update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
3334
3335
def args_to_str(args):
    """ Get a short string representation for a subprocess command:
    every argument is shell-quoted and the results are joined by single spaces """
    quoted_args = (compat_shlex_quote(arg) for arg in args)
    return ' '.join(quoted_args)
3339
3340
def error_to_str(err):
    """ Format an exception as "<class name>: <message>" """
    error_name = type(err).__name__
    return f'{error_name}: {err}'
3343
3344
def mimetype2ext(mt, default=NO_DEFAULT):
    """ Map a MIME type (optionally with ";"-separated parameters) to a file extension.

    @param mt       The MIME type. Anything that is not a string is treated as unknown
    @param default  Returned for unknown types. If not given, None is returned for
                    non-string input and a guess derived from the subtype otherwise
    """
    if not isinstance(mt, str):
        return None if default is NO_DEFAULT else default

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop any parameters (e.g. "; charset=...") and normalize the rest
    mimetype = mt.partition(';')[0].strip().lower()
    subtype = mimetype.rpartition('/')[2]
    suffix = subtype.rsplit('+')[-1]

    # Candidates are passed from most to least specific:
    # the full type, then the subtype, then whatever follows the last "+"
    ext = traversal.traverse_obj(MAP, mimetype, subtype, suffix)
    if ext:
        return ext
    if default is NO_DEFAULT:
        return subtype.replace('+', '.')
    return default
3433
3434
def ext2mimetype(ext_or_url):
    """ Guess the MIME type for a bare extension or for a filename/URL.
    Returns None for empty input or if no type could be guessed """
    if not ext_or_url:
        return None
    # A value without any "." is taken to be a bare extension and turned into a dummy filename
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    guessed_type, _ = mimetypes.guess_type(name)
    return guessed_type
3441
3442
def parse_codecs(codecs_str):
    """ Split a comma-separated codecs string into video, audio and subtitle codecs.

    @returns  {} for empty input. Otherwise a dict with "vcodec", "acodec" ('none' if absent),
              "dynamic_range" and, if a subtitle codec was found, "scodec".
              If nothing was recognized but exactly two codecs were given,
              they are taken to be the video and the audio codec, in that order
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros directly in front of a digit are dropped before the codec is split into its parts
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                continue  # Only the first video codec counts
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if vcodec or acodec or scodec:
        info = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if scodec is not None:
            info['scodec'] = scodec
        return info
    if len(split_codecs) == 2:
        first, second = split_codecs
        return {
            'vcodec': first,
            'acodec': second,
        }
    return {}
3483
3484
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """ Pick a container extension for the given video/audio codecs and extensions.

    @param vcodecs, vexts  Codecs and extensions of the video streams (same length)
    @param acodecs, aexts  Codecs and extensions of the audio streams (same length)
    @param preferences     Extensions to try, in order. If empty, every known container
                           and "mkv" are allowed
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    mkv_allowed = not preferences or 'mkv' in preferences

    if mkv_allowed and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codecs):
        # Reduce the first codec of the list to the part before its first "." with all zeros removed
        return try_get(codecs, getter=lambda x: x[0].split('.')[0].replace('0', ''))

    wanted_codecs = (sanitize_codec(vcodecs), sanitize_codec(acodecs))

    # First try to decide based on the codecs ...
    for ext in preferences or COMPATIBLE_CODECS.keys():
        if ext == 'mkv':
            return ext
        if COMPATIBLE_CODECS.get(ext, set()).issuperset(wanted_codecs):
            return ext

    # ... and then based on the extensions
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        if ext == 'mkv':
            return ext
        current_exts = {ext, *vexts, *aexts}
        if current_exts == {ext}:
            return ext
        if any(ext_set.issuperset(current_exts) for ext_set in COMPATIBLE_EXTS):
            return ext

    if mkv_allowed:
        return 'mkv'
    return preferences[-1]
3523
3524
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """ Detect the file extension from the headers of a response.
    Tried in order: the filename in "Content-Disposition", the "x-amz-meta-name"
    header and finally the MIME type in "Content-Type" """
    getheader = url_handle.headers.get

    content_disposition = getheader('Content-Disposition')
    if content_disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    meta_name = getheader('x-amz-meta-name')
    if meta_name:
        # Whatever follows the last "." (the whole value if there is no ".")
        ext = meta_name.rpartition('.')[2]
        if ext:
            return ext

    content_type = getheader('Content-Type')
    return mimetype2ext(content_type, default=default)
3543
3544
def encode_data_uri(data, mime_type):
    """ Build a base64 "data:" URI from the bytes in data and the given MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3547
3548
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked if no limit is set (age_limit is None)
    # or if the content is available for everyone (content_limit is None)
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3557
3558
# List of known byte-order-marks (BOM), as (marker bytes, codec name) pairs.
# The 4-byte utf-32 markers come before the 2-byte utf-16 ones: b'\xff\xfe\x00\x00' (utf-32-le)
# itself starts with b'\xff\xfe' (utf-16-le), so code that walks this list in order
# gets to test the longer marker first
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3567
3568
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.
    Returns a match object if the text (after any BOMs and whitespace) starts with "<", else None """

    detected_encoding = 'utf-8'
    remaining = first_bytes
    for marker, marker_encoding in BOMS:
        # Strip every repetition of this marker; the last marker found decides the encoding
        while remaining.startswith(marker):
            detected_encoding = marker_encoding
            remaining = remaining[len(marker):]

    text = remaining.decode(detected_encoding, 'replace')
    return re.match(r'^\s*<', text)
3578
3579
def determine_protocol(info_dict):
    """ Return the "protocol" of info_dict if it is set.
    Otherwise derive it from the "url": first from its prefix, then from its extension
    and finally from its URL scheme """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        if info_dict.get('is_live'):
            return 'm3u8'
        return 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3600
3601
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param delim       If set, a line made of this string is inserted below the header
    @param extra_gap   Number of additional spaces between two columns
    @param hide_empty  Drop every column in which all cells of data are empty
    """
    def visible_width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_width(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, mask):
        # Columns beyond the end of the mask are always kept (fillvalue=True)
        return [cell for keep, cell in itertools.zip_longest(mask, row, fillvalue=True) if keep]

    # A column width of 0 is falsy, so the widths of data double as the mask of columns to keep
    column_mask = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, column_mask)
    data = [keep_columns(row, column_mask) for row in data]

    table = [header_row] + data
    max_lens = column_widths(table)
    extra_gap += 1
    if delim:
        delim_row = [delim * (max_len + extra_gap) for max_len in max_lens]
        delim_row[-1] = delim_row[-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
        table = [header_row, delim_row] + data

    lines = []
    for row in table:
        cells = []
        for pos, text in enumerate(map(str, row)):
            padding = max_lens[pos] - visible_width(text)
            if '\t' in text:
                # The padding goes where the tab is, pushing the text after it to the right
                cells.append(text.replace('\t', ' ' * padding) + ' ' * extra_gap)
            else:
                cells.append(text + ' ' * (padding + extra_gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3632
3633
3634 def _match_one(filter_part, dct, incomplete):
3635     # TODO: Generalize code with YoutubeDL._build_format_filter
3636     STRING_OPERATORS = {
3637         '*=': operator.contains,
3638         '^=': lambda attr, value: attr.startswith(value),
3639         '$=': lambda attr, value: attr.endswith(value),
3640         '~=': lambda attr, value: re.search(value, attr),
3641     }
3642     COMPARISON_OPERATORS = {
3643         **STRING_OPERATORS,
3644         '<=': operator.le,  # "<=" must be defined above "<"
3645         '<': operator.lt,
3646         '>=': operator.ge,
3647         '>': operator.gt,
3648         '=': operator.eq,
3649     }
3650
3651     if isinstance(incomplete, bool):
3652         is_incomplete = lambda _: incomplete
3653     else:
3654         is_incomplete = lambda k: k in incomplete
3655
3656     operator_rex = re.compile(r'''(?x)
3657         (?P<key>[a-z_]+)
3658         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3659         (?:
3660             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3661             (?P<strval>.+?)
3662         )
3663         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3664     m = operator_rex.fullmatch(filter_part.strip())
3665     if m:
3666         m = m.groupdict()
3667         unnegated_op = COMPARISON_OPERATORS[m['op']]
3668         if m['negation']:
3669             op = lambda attr, value: not unnegated_op(attr, value)
3670         else:
3671             op = unnegated_op
3672         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3673         if m['quote']:
3674             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3675         actual_value = dct.get(m['key'])
3676         numeric_comparison = None
3677         if isinstance(actual_value, (int, float)):
3678             # If the original field is a string and matching comparisonvalue is
3679             # a number we should respect the origin of the original field
3680             # and process comparison value as a string (see
3681             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3682             try:
3683                 numeric_comparison = int(comparison_value)
3684             except ValueError:
3685                 numeric_comparison = parse_filesize(comparison_value)
3686                 if numeric_comparison is None:
3687                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3688                 if numeric_comparison is None:
3689                     numeric_comparison = parse_duration(comparison_value)
3690         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3691             raise ValueError('Operator %s only supports string values!' % m['op'])
3692         if actual_value is None:
3693             return is_incomplete(m['key']) or m['none_inclusive']
3694         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3695
3696     UNARY_OPERATORS = {
3697         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3698         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3699     }
3700     operator_rex = re.compile(r'''(?x)
3701         (?P<op>%s)\s*(?P<key>[a-z_]+)
3702         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3703     m = operator_rex.fullmatch(filter_part.strip())
3704     if m:
3705         op = UNARY_OPERATORS[m.group('op')]
3706         actual_value = dct.get(m.group('key'))
3707         if is_incomplete(m.group('key')) and actual_value is None:
3708             return True
3709         return op(actual_value)
3710
3711     raise ValueError('Invalid filter part %r' % filter_part)
3712
3713
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    The conditions in filter_str are separated by "&" (a literal "&" is written as "\\&")
    and all of them have to pass.
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        condition = filter_part.replace(r'\&', '&')
        if not _match_one(condition, dct, incomplete):
            return False
    return True
3724
3725
def match_filter_func(filters, breaking_filters=None):
    """ Build a function that checks an info_dict against the given filters.

    @param filters           Filter string(s); an entry passes if any one of them matches.
                             The special filter "-" turns on interactive mode
    @param breaking_filters  Filter string(s) for which a failure raises RejectedVideoReached
    @returns                 None if no filters are given at all. Otherwise a function
                             (info_dict, incomplete=False) that returns the reason for skipping
                             the entry, or None (NO_DEFAULT in interactive mode) if it passes
    """
    if not (filters or breaking_filters):
        return None
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        rejection = breaking_filters(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if filters and not any(match_str(f, info_dict, incomplete) for f in filters):
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
        if interactive and not incomplete:
            return NO_DEFAULT
        return None
    return _match_func
3748
3749
class download_range_func:
    """ Callable that yields the sections of a video that are to be downloaded.

    @param chapters  Regexes; every chapter whose title matches one of them is selected
    @param ranges    (start, end) pairs that are selected as they are
    """

    def __init__(self, chapters, ranges):
        self.chapters = chapters
        self.ranges = ranges

    def __call__(self, info_dict, ydl):
        # Without any restriction, a single empty section is yielded
        if not (self.ranges or self.chapters):
            yield {}

        if info_dict.get('chapters'):
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'
        for regex in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(regex, chapter['title']):
                    continue
                warning = None
                yield {**chapter, 'index': index}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3776
3777
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP time expression to seconds.
    Understood are a plain number with an optional "s" suffix and a clock value "H:MM:SS"
    with an optional fraction after "." or ":". Returns None for anything else """
    if not time_expr:
        return None

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        # The fraction may be separated by ":" instead of "."
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
3789
3790
def srt_subtitles_timecode(seconds):
    """ Format a time in seconds as "HH:MM:SS,mmm" """
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
3793
3794
def ass_subtitles_timecode(seconds):
    """ Format a time in seconds as "H:MM:SS.cc", where cc are hundredths of a second """
    timetuple = timetuple_from_msec(seconds * 1000)
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
3798
3799
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT.
    Paragraphs without a begin time, or with neither an end time nor a duration, are skipped.
    Each cue is numbered by the position of its paragraph, so skipped paragraphs leave gaps.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document does not contain any paragraph
    '''
    # Namespaces of older drafts, each mapped to the namespace it is replaced with before parsing
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # The only styling attributes that are read from the document
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> {property: value}, including what is inherited from the parent style
    default_style = {}  # style of <body>/<div>; applied to every element

    class TTMLPElementParser:
        # Target object for XMLParser: turns one paragraph into text with <b>/<i>/<u>/<font> tags.
        # NOTE: The two lists below are class attributes and are only ever mutated in place,
        # so they are shared by all the parser instances that are created during this call
        _out = ''
        _unclosed_elements = []  # One list of opened tags per element that is currently open
        _applied_styles = []  # Stack of the merged styles that are in effect

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Later updates win: default style < referenced style < inline attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Nothing to emit if the property already has this value
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                # NOTE: A style is only popped if the element has opened a tag, whereas start()
                # pushes one whenever the element has any style
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize the node and feed it through the parser above to get the text of the cue
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve the inheritance between styles. A style can only be built after its parent,
    # which may be defined further down in the document, so passes are repeated as needed.
    # A parent may never become available (unknown id, circular reference, or a parent that
    # has none of the supported properties). The loop therefore also stops as soon as a pass
    # defers just as many styles as the pass before it; those styles are left undefined
    unresolved = None
    while True:
        deferred = 0
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    deferred += 1
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not deferred or deferred == unresolved:
            break
        unresolved = deferred

    # The style of the first <body> and of the first <div> applies to all the paragraphs
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3966
3967
def cli_option(params, command_option, param, separator=None):
    """ Build the command line arguments for the option command_option from params[param].

    @returns  [] if the parameter is unset (None), [command_option, value] if there is
              no separator, otherwise a single argument joined by separator
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3973
3974
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Like cli_option, but for a parameter that is True, False or unset:
    the argument is given as true_value/false_value and nothing is generated if it is unset """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered_values = {True: true_value, False: false_value}
    return cli_option(rendered_values, command_option, flag, separator)
3979
3980
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] if params[param] equals expected_value, else [] """
    if params.get(param) == expected_value:
        return [command_option]
    return []
3983
3984
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """ Look up extra command line arguments.

    @param argdict     Dict that maps lowercase keys to lists of arguments.
                       For backward compatibility it can also be a plain list/tuple of arguments
    @param keys        The keys to try, in order. An entry may itself hold several keys,
                       in which case the arguments of all of them are concatenated
    @param default     Returned as it is (not copied) if nothing was found
    @param use_compat  Whether a plain list/tuple argdict is returned or ignored
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = (argdict.get(key.lower()) for key in variadic(key_list))
        arg_list = [args for args in candidates if args is not None]
        if arg_list:
            return list(itertools.chain.from_iterable(arg_list))
    return default
4003
4004
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """ Build the lookup keys for the executable exe used by main_key
    and fetch the matching arguments with cli_configuration_args.

    The root key is "exe" if it equals main_key, else "main_key+exe"; every entry of keys
    is a suffix to it. Only if the bare root key is among the keys looked up,
    (main_key, exe) and "default" are tried as well. Otherwise use_compat is turned off
    """
    main_key = main_key.lower()
    exe = exe.lower()
    same_key = main_key == exe
    root_key = exe if same_key else f'{main_key}+{exe}'

    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if not same_key:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
4016
4017
class ISO639Utils:
    """ Conversion between two-letter and three-letter language codes """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked up. None is returned for unknown codes
        """
        short_name = code[:2]
        return cls._lang_map.get(short_name)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        If several short codes share the long code, the one listed first is returned.
        None is returned for unknown codes
        """
        matching = (
            short_name for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matching, None)
4222
4223
class ISO3166Utils:
    """Lookup of country names by their two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the corresponding full name

        The lookup is case-insensitive because `code` is upper-cased first.
        Returns None when the code is not in the table.
        """
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
4485
4486
class GeoUtils:
    """Helpers for picking random IPv4 addresses out of per-country blocks."""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random dotted-quad IPv4 address from a country or CIDR block

        @param code_or_block  Either a two-character country code (looked up
                              case-insensitively in `_country_ip_map`) or an
                              explicit block in `address/prefix-length` form.
                              Any argument of length 2 is treated as a code.
        @returns              The address as a str, or None when a
                              two-character code has no block in the table.
        """
        if len(code_or_block) != 2:
            cidr = code_or_block
        else:
            cidr = cls._country_ip_map.get(code_or_block.upper())
            if not cidr:
                return None

        network, prefix_length = cidr.split('/')
        # Addresses are handled as unsigned 32-bit integers in network byte order
        (lowest,) = struct.unpack('!L', socket.inet_aton(network))
        # Setting every host bit on the given address yields the upper bound
        host_bits = 0xffffffff >> int(prefix_length)
        highest = lowest | host_bits
        picked = random.randint(lowest, highest)
        return str(socket.inet_ntoa(struct.pack('!L', picked)))
4745
4746
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden per request

    A request carrying a `Ytdl-request-proxy` header uses that value instead
    of the handler-wide proxy; the header is removed before the request is
    passed on.
    """

    def __init__(self, proxies=None):
        # Set default handlers, so that http/https requests always reach
        # proxy_open even when no proxy is configured for that scheme
        for scheme in ('http', 'https'):
            default_open = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
            setattr(self, f'{scheme}_open', default_open)
        super().__init__(proxies)

    def proxy_open(self, req, proxy, type):
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        socks_schemes = ('socks', 'socks4', 'socks4a', 'socks5')
        if urllib.parse.urlparse(proxy).scheme.lower() in socks_schemes:
            # Only record the proxy on the request here: yt-dlp's http/https
            # handlers are the ones wrapping the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return super().proxy_open(req, proxy, type)
4770
4771
4772 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4773 # released into Public Domain
4774 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4775
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:int, blocksize:int) : bytes
    Convert an integer to a big-endian byte string.

    The result has no leading zero bytes; zero (and any negative value)
    is encoded as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # Minimal big-endian encoding: just enough bytes to hold every bit
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4804
4805
def bytes_to_long(s):
    """bytes_to_long(bytes) : int
    Convert a big-endian byte string to an integer.

    This is (essentially) the inverse of long_to_bytes(). Leading zero bytes
    do not change the result, and an empty byte string yields 0.
    """
    return int.from_bytes(s, 'big')
4821
4822
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The payload is `data` read as a little-endian integer (the same value as
    # the byte-reversed data read as hex). Unlike the hex round-trip, this
    # also works for empty data, which encodes as 0.
    payload = int.from_bytes(data, 'little')
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
4838
4839
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout `00 02 <nonzero random bytes> 00 <data>`.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data is longer than length - 11
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The random padding must not contain a zero byte: the first zero after
    # the `00 02` header marks where the data starts when the block is decoded
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2, *pseudo_random, 0, *data]
4853
4854
4855 def _base_n_table(n, table):
4856     if not table and not n:
4857         raise ValueError('Either table or n must be specified')
4858     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4859
4860     if n and n != len(table):
4861         raise ValueError(f'base {n} exceeds table length {len(table)}')
4862     return table
4863
4864
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string

    The digits are taken from `_base_n_table(n, table)`.

    @raises ValueError  if `num` is negative (in addition to the errors
                        raised by `_base_n_table`)
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]
    if num < 0:
        # Floor division never brings a negative number to zero, so the
        # conversion loop below would not terminate
        raise ValueError(f'cannot encode negative number {num}')

    digits, base = [], len(table)
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    # Digits were produced least significant first
    return ''.join(reversed(digits))
4876
4877
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int

    The digits are interpreted according to `_base_n_table(n, table)`.
    A character that is not in the table raises KeyError.
    """
    alphabet = _base_n_table(n, table)
    digit_values = dict(zip(alphabet, range(len(alphabet))))
    radix = len(digit_values)

    value = 0
    for symbol in string:
        value = value * radix + digit_values[symbol]
    return value
4885
4886
def decode_packed_codes(code):
    """Expand code that matches PACKED_CODES_RE back into its unpacked form

    The match supplies the obfuscated code, a base, a symbol count and a
    `|`-separated symbol list. Every word of the obfuscated code is replaced
    by the symbol whose index is that word read in the given base; an empty
    symbol stands for the word itself.
    """
    packed, radix, remaining, words = re.search(PACKED_CODES_RE, code).groups()
    radix, remaining = int(radix), int(remaining)
    words = words.split('|')

    replacements = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        replacements[token] = words[remaining] or token

    def substitute(match):
        return replacements[match.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, packed)
4903
4904
def caesar(s, alphabet, shift):
    """Apply a Caesar cipher to `s`

    Every character of `s` found in `alphabet` is replaced by the character
    `shift` positions further along, wrapping around at the end; all other
    characters are kept. A shift of 0 returns `s` itself.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4912
4913
def rot47(s):
    """Rotate every character of `s` from `!` through `~` by 47 positions

    Characters outside that 94-character alphabet are left unchanged.
    """
    rotation_alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, rotation_alphabet, 47)
4916
4917
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated `KEY=value` attribute list into a dict

    Values may be bare or double-quoted; the surrounding quotes are removed
    from quoted values. When a key occurs more than once, the last value wins.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        name: raw[1:-1] if raw.startswith('"') else raw
        for name, raw in pairs
    }
4925
4926
def urshift(val, n):
    """Shift `val` right by `n` bits, treating it as an unsigned 32-bit value

    A negative `val` has 2**32 added to it before the shift.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
4929
4930
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file `path` to the bytes `value`

    Methods are tried in this order:
      * on Windows, the value is written to the stream `<path>:<key>`
      * the xattr/pyxattr python module, when available
      * the `setfattr` or `xattr` executable

    @raises XAttrMetadataError     if writing the attribute fails
    @raises XAttrUnavailableError  if neither a python module nor an
                                   executable for setting xattrs is found
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as stream:
                stream.write(value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        module_setter = xattr.set if version_tuple(xattr.__version__) >= (0, 5, 0) else None
    elif xattr:
        module_setter = xattr.setxattr
    else:
        module_setter = None

    if module_setter:
        try:
            module_setter(path, key, value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    if check_executable('setfattr', ['--version']):
        tool = 'setfattr'
    elif check_executable('xattr', ['-h']):
        tool = 'xattr'
    else:
        if sys.platform != 'linux':
            suggestion = '"xattr" binary'
        else:
            suggestion = 'GNU "attr" package (which contains the "setfattr" tool)'
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + suggestion)

    # The executables take the value as a text argument
    text_value = value.decode()
    if tool == 'xattr':
        command = [tool, '-w', key, text_value, path]
    else:
        command = [tool, '-n', key, '-v', text_value, path]

    try:
        _, stderr, returncode = Popen.run(
            command, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as err:
        raise XAttrMetadataError(err.errno, err.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4980
4981
def random_birthday(year_field, month_field, day_field):
    """Return a random date of birth as a dict of form fields

    The date is drawn from 1950-01-01 through 1995-12-31 (both inclusive).
    The given field names are the keys; the values are the year, month and
    day as decimal strings without zero padding.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    day_span = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, day_span))

    field_names = (year_field, month_field, day_field)
    date_parts = (birthday.year, birthday.month, birthday.day)
    return dict(zip(field_names, map(str, date_parts)))
4992
4993
def find_available_port(interface=''):
    """Return a port number that could be bound on `interface`

    A socket is bound to port 0 and the port it was actually given is
    returned; the socket is closed again before returning.
    Returns None when an OSError occurs.
    """
    with contextlib.suppress(OSError):
        with socket.socket() as probe:
            probe.bind((interface, 0))
            return probe.getsockname()[1]
    return None
5001
5002
# Templates for internet shortcut files, which are plain text files.
# Each template is filled in with %-formatting from a mapping; the line
# endings are spelled out explicitly.

# `.url` shortcut: expects the key `url`
DOT_URL_LINK_TEMPLATE = (
    '[InternetShortcut]\n'
    'URL=%(url)s\n'
)

# `.webloc` shortcut (an XML property list): expects the key `url`
DOT_WEBLOC_LINK_TEMPLATE = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n'
    '<plist version="1.0">\n'
    '<dict>\n'
    '\t<key>URL</key>\n'
    '\t<string>%(url)s</string>\n'
    '</dict>\n'
    '</plist>\n'
)

# `.desktop` shortcut: expects the keys `filename` and `url`
DOT_DESKTOP_LINK_TEMPLATE = (
    '[Desktop Entry]\n'
    'Encoding=UTF-8\n'
    'Name=%(filename)s\n'
    'Type=Link\n'
    'URL=%(url)s\n'
    'Icon=text-html\n'
)

# The templates above, keyed by the name of the shortcut format
LINK_TEMPLATES = dict(
    url=DOT_URL_LINK_TEMPLATE,
    desktop=DOT_DESKTOP_LINK_TEMPLATE,
    webloc=DOT_WEBLOC_LINK_TEMPLATE,
)
5034
5035
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    An explicit port is dropped only when it is port 80 of an `http` IRI; any other explicit port is kept.
    An IRI without a hostname (e.g. a relative reference) is converted without a network location.

    Raises ValueError for IRIs with an IPv6 host.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    if iri_parts.hostname:
        # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
        net_location += iri_parts.hostname.encode('idna').decode()

    port = iri_parts.port
    # Port 80 is redundant for `http` only. For any other scheme, removing it
    # would make the URI point at that scheme's own default port instead.
    if port is not None and not (iri_parts.scheme == 'http' and port == 80):
        net_location += ':' + str(port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5078
5079
def to_high_limit_path(path):
    """Return `path` in a form that bypasses the Windows MAX_PATH limit.

    On 'win32' and 'cygwin' the absolute path is prefixed with the
    extended-length marker; on every other platform `path` is returned as is.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length
    # for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
5086
5087
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Look up `field` in `obj` and render it through `template`.

    The value is fetched with traversal.traverse_obj. `default` is returned
    when the value is falsy (if `ignore` is NO_DEFAULT) or when it is one of
    the values given in `ignore`. Otherwise the result is
    `template % func(value)`.
    """
    val = traversal.traverse_obj(obj, *variadic(field))

    if ignore is NO_DEFAULT:
        skip = not val
    else:
        skip = val in variadic(ignore)

    if skip:
        return default
    return template % func(val)
5093
5094
def clean_podcast_url(url):
    """Strip known podcast tracking/analytics prefixes out of `url`.

    Every occurrence of a recognised tracker segment (including its trailing
    slash) is removed; the rest of the URL is left untouched.
    """
    tracker_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracker_prefix_re, '', url)
5110
5111
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version-4 UUID string (lowercase, hyphenated).

    In the template, `x` stands for any hex digit and `y` for the digit that
    carries the variant bits. RFC 4122 requires the two most significant bits
    of that digit to be `10`, so `y` is drawn from 8, 9, a, b only.
    """
    def _random_digit(mobj):
        if mobj.group() == 'y':
            # Variant digit: restrict to 0b10xx so the UUID is RFC 4122 compliant
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5117
5118
def make_dir(path, to_screen=None):
    """Create the parent directory of `path` if it does not exist yet.

    @param path       File path whose directory component should exist
    @param to_screen  Optional callable used to report a failure message
    @returns          True on success (or when `path` has no directory part),
                      False if the directory could not be created
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Only report when a reporter was actually supplied; the default
        # `to_screen=None` must fail silently instead of raising TypeError
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
5129
5130
def get_executable_path():
    """Return the absolute directory containing the running executable.

    The executable location is the second element of the value returned by
    `_get_variant_and_executable_path` from the update module.
    """
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    absolute = os.path.abspath(executable)
    return os.path.dirname(absolute)
5135
5136
def get_user_config_dirs(package_name):
    """Yield candidate per-user configuration directories for `package_name`.

    Order: the XDG config directory, then the %APPDATA% directory (only when
    the `appdata` environment variable is set), then a dot-directory in the
    user's home.
    """
    # .config (e.g. ~/.config/package_name)
    config_root = os.getenv('XDG_CONFIG_HOME')
    if not config_root:
        config_root = compat_expanduser('~/.config')
    yield os.path.join(config_root, package_name)

    # appdata (%APPDATA%/package_name)
    appdata_root = os.getenv('appdata')
    if appdata_root:
        yield os.path.join(appdata_root, package_name)

    # home (~/.package_name)
    home_root = compat_expanduser('~')
    yield os.path.join(home_root, f'.{package_name}')
5149
5150
def get_system_config_dirs(package_name):
    """Yield the system-wide configuration directories for `package_name`."""
    # /etc/package_name
    system_dir = os.path.join('/etc', package_name)
    yield system_dir
5154
5155
def time_seconds(**kwargs):
    """
    Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the offset described by `kwargs`.

    `kwargs` are passed unchanged to datetime.timedelta (e.g. hours=9).
    """
    now = time.time()
    offset = datetime.timedelta(**kwargs)
    return now + offset.total_seconds()
5161
5162
5163 # create a JSON Web Signature (jws) with HS256 algorithm
5164 # the resulting format is in JWS Compact Serialization
5165 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5166 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Build an HS256-signed token in `header.payload.signature` form.

    @param payload_data  JSON-serialisable payload
    @param key           Signing secret (str)
    @param headers       Extra header fields merged over the default
                         {'alg': 'HS256', 'typ': 'JWT'} header
    @returns             The token as bytes

    NB: each part is encoded with standard, padded base64 (base64.b64encode),
    not the unpadded URL-safe alphabet.
    """
    def _b64_json(obj):
        return base64.b64encode(json.dumps(obj).encode())

    header_data = {'alg': 'HS256', 'typ': 'JWT'}
    if headers:
        header_data.update(headers)

    signing_input = b'.'.join((_b64_json(header_data), _b64_json(payload_data)))
    digest = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(digest)
5180
5181
5182 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of a three-part JWT string.

    The header and signature are neither parsed nor verified. A ValueError is
    raised by the unpacking if `jwt` does not consist of exactly three
    dot-separated parts.
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    raw_payload = base64.urlsafe_b64decode(f'{payload_b64}===')
    return json.loads(raw_payload)
5188
5189
# False on Windows until VT mode has been enabled; None on every other OS
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Tell whether terminal escape sequences can be written to `stream`.

    On Windows this requires WINDOWS_VT_MODE to be truthy; elsewhere the TERM
    environment variable must be set. In both cases the stream must also
    report itself as a TTY. The result is cached per stream.
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False

    try:
        return stream.isatty()
    except BaseException:
        return False
5204
5205
def windows_enable_vt_mode():
    """Enable virtual terminal processing on the Windows console.

    Does nothing when get_windows_version() reports a version older than
    (10, 0, 10586). Raises Exception if the console mode cannot be read or
    written. On success, WINDOWS_VT_MODE is set to True and the cache of
    supports_terminal_sequences is cleared.

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported lazily: these modules are only needed on this code path
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device and operate on its OS-level handle
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Keep the existing mode bits and add the VT processing flag
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        # The file descriptor is closed whether or not the mode change worked
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Cached answers were computed while WINDOWS_VT_MODE was still falsy
    supports_terminal_sequences.cache_clear()
5236
5237
# ESC [ ... m sequences (colours and other text attributes)
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with all `ESC[...m` terminal sequences removed."""
    return re.sub(_terminal_sequences_re, '', string)
5243
5244
def number_of_digits(number):
    """Return the length of `number` formatted with `%d` (a minus sign counts)."""
    rendered = '%d' % number
    return len(rendered)
5247
5248
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy `values` with `delim` after converting them to str.

    When `from_dict` is given, each value is first treated as a path and
    looked up in `from_dict` with traversal.traverse_obj.
    """
    if from_dict is not None:
        values = [traversal.traverse_obj(from_dict, variadic(path)) for path in values]
    return delim.join(str(value) for value in values if value)
5253
5254
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Pick the largest (width, height) pair among `formats` and, for each thumbnail:
    * Rewrite the URL: whatever `url_width_re` matches is replaced with that width
    * Fill in the width/height from that pair

    `thumbnails` is returned unchanged when no format has a usable width.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    dimensions = [tuple(fmt.get(key) or 0 for key in _keys) for fmt in formats]
    max_dimensions = max(dimensions, default=(0, 0))
    max_width = max_dimensions[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': scaled_url},
            dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
5275
5276
def parse_http_range(range):
    """Parse a "Range" or "Content-Range" HTTP header value.

    @returns  (start, end, total); all three are None when `range` is empty
              or does not contain a byte range. `end` and `total` are passed
              through int_or_none as they are optional in the header.
    """
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not crg:
        return None, None, None

    start, end, total = crg.group(1), crg.group(2), crg.group(3)
    return int(start), int_or_none(end), int_or_none(total)
5285
5286
def read_stdin(what):
    """Announce that `what` is being read from STDIN and return sys.stdin.

    The announcement names the platform's EOF key combination.
    """
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    else:
        eof = 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5291
5292
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data  Leading bytes of the file
    @returns (encoding, bytes to skip); encoding is None if nothing was detected
    """

    # BOM marks are given priority over declarations
    bom_match = next(
        ((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_match is not None:
        return bom_match

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    without_nulls = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', without_nulls)
    if declaration is None:
        return None, 0
    return declaration.group(1).decode(), 0
5309
5310
class Config:
    """A tree of option lists: own arguments plus nested configuration files.

    Each Config holds the arguments it was initialised with (`own_args`) and
    the child Config objects created for every location requested through the
    parsed `config_locations` option. `all_args` flattens the tree into one
    argument sequence.
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        # _loaded_paths is shared with child configs to avoid loading a location twice
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Set this config's own arguments and load nested configs.

        @returns  False if `filename` was already loaded, True otherwise
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own_args and append a child config for every config location."""
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against the directory of this config file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read `filename` and split its contents into a list of arguments.

        @param filename  Path of the configuration file
        @param default   Value returned when the file cannot be opened
        @returns         List of arguments (shlex-split, `#` comments removed)
        @raises ValueError  if the contents cannot be decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present

        # `with` guarantees the handle is closed on every exit path, including
        # non-OSError failures during encoding detection
        with optionf:
            try:
                enc, skip = determine_file_encoding(optionf.read(512))
                optionf.seek(skip, io.SEEK_SET)
            except OSError:
                enc = None  # silently skip read errors
            try:
                # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                contents = optionf.read().decode(enc or preferredencoding())
                return shlex.split(contents, comments=True)
            except Exception as err:
                raise ValueError(f'Unable to parse "{filename}": {err}') from err

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with the values of credential options replaced by 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handles the `--option=value` form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handles the `--option value` form
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config from `args` (as accepted by init) and keep it if it loaded."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Arguments of all child configs (last appended first), followed by this config's own."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
5418
5419
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes

    A private event loop is created per instance and every operation is run
    to completion on it. The connection is closed through __exit__, which is
    also registered with atexit.
    """
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        # __exit__ is also registered with atexit, so it can run a second time
        # after the loop has been closed; nothing is left to do in that case
        if self.loop.is_closed():
            return
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Pending tasks must be cancelled while the loop can still run
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` to completion on `loop` and return its result.

        @raises ValueError  if `main` is not a coroutine object
        """
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel every unfinished task of `loop` and wait for them to finish."""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # The "loop" argument of gather() was removed in Python 3.10;
        # gather() takes the loop from the tasks themselves
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5489
5490
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones

    Header names are normalised with str.title().
    """
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
5494
5495
def cached_method(f):
    """Decorator that caches a method's results on the instance.

    Results are stored in the instance's `_cached_method__cache` attribute,
    per method name, keyed by the bound argument values (defaults applied,
    `self` excluded).
    """
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        _self, *call_values = bound.arguments.values()
        key = tuple(call_values)

        instance_caches = vars(self).setdefault('_cached_method__cache', {})
        method_cache = instance_caches.setdefault(f.__name__, {})
        if key in method_cache:
            return method_cache[key]
        method_cache[key] = f(self, *args, **kwargs)
        return method_cache[key]
    return wrapper
5511
5512
class classproperty:
    """Descriptor exposing a function of the class as a read-only attribute.

    Usable bare (`@classproperty`) or with options (`@classproperty(cache=True)`).
    With `cache=True` the value is computed once per class.
    """

    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Called with options only: return a decorator awaiting the function
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls not in cache:
            cache[cls] = self.func(cls)
        return cache[cls]
5531
5532
class function_with_repr:
    """Callable wrapper around `func` with a customisable repr().

    repr() gives `repr_` when it is truthy, else `<module>.<qualname>` of `func`.
    """

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5545
5546
class Namespace(types.SimpleNamespace):
    """SimpleNamespace that can be iterated over its attribute values"""

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        # (name, value) pairs of all attributes
        return vars(self).items()
5556
5557
# File extensions grouped by media kind
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# After these two statements, `video`/`audio` also contain the `common_*` entries (appended at the end)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in that order
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5572
5573
class RetryManager:
    """Iterator driving a retry loop. Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue

    Iteration stops once an attempt finishes without `error` being set, or
    when the number of attempts exceeds `retries`. After each failed attempt
    the error callback is called with (error, attempt, retries).
    """
    attempt = 0
    _error = None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # NO_DEFAULT marks an attempt that completed without an error
        if self._error is NO_DEFAULT:
            return False
        return self.attempt <= self.retries

    @property
    def error(self):
        return None if self._error is NO_DEFAULT else self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            failure = self.error
            if failure:
                self.error_callback(failure, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        When `count` exceeds `retries`, the failure is passed to `error` if
        given, else `e` is raised. Otherwise a warning is issued and, if
        `sleep_func` yields a delay, the function sleeps for that long.
        """
        if count > retries:
            if not error:
                raise e
            if count > 1:
                return error(f'{e}. Giving up after {count - 1} retries')
            return error(str(e))

        if not count:
            return warn(e)
        if isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        if callable(sleep_func):
            delay = float_or_none(sleep_func(n=count - 1))
        else:
            delay = sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5628
5629
def make_archive_id(ie, video_id):
    """Return the archive entry `<lowercased extractor key> <video_id>`.

    `ie` is either the extractor key as a string or an object providing ie_key().
    """
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
5633
5634
def truncate_string(s, left, right=0):
    """Shorten `s` to `left + right` characters, marking the cut with '...'.

    The first `left - 3` and the last `right` characters are kept. `s` is
    returned unchanged when it is None or already short enough.
    """
    assert left > 3 and right >= 0
    if s is None:
        return s
    if len(s) <= left + right:
        return s

    head = s[:left - 3]
    tail = s[-right:] if right else ""
    return f'{head}...{tail}'
5640
5641
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Resolve a list of option names into an ordered set of requested items.

    @param options     Names to add; a leading '-' removes the item instead
    @param alias_dict  Maps alias names to lists of options; must contain
                       'all', the list of every valid item
    @param use_regex   Treat non-alias names as case-insensitive regexes that
                       must fully match an item of alias_dict['all']
    @param start       Items requested before `options` are applied
    @raises ValueError for a name that is neither an alias nor a valid item
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for option in options:
        discard = option.startswith('-')
        name = option[1:] if discard else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if discard:
                # Discarding an alias flips the sign of each of its members
                expansion = [
                    i[1:] if i.startswith('-') else f'-{i}' for i in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            current = [name]
        else:
            raise ValueError(name)

        if not discard:
            requested.extend(current)
            continue
        for item in current:
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5670
5671
5672 # TODO: Rewrite
5673 class FormatSorter:
5674     regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5675
5676     default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5677                'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5678                'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
5679     ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5680                     'height', 'width', 'proto', 'vext', 'abr', 'aext',
5681                     'fps', 'fs_approx', 'source', 'id')
5682
5683     settings = {
5684         'vcodec': {'type': 'ordered', 'regex': True,
5685                    'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5686         'acodec': {'type': 'ordered', 'regex': True,
5687                    'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
5688         'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5689                 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5690         'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5691                   'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5692         'vext': {'type': 'ordered', 'field': 'video_ext',
5693                  'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5694                  'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
5695         'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5696                  'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5697                  'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
5698         'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5699         'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5700                        'field': ('vcodec', 'acodec'),
5701                        'function': lambda it: int(any(v != 'none' for v in it))},
5702         'ie_pref': {'priority': True, 'type': 'extractor'},
5703         'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5704         'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5705         'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5706         'quality': {'convert': 'float', 'default': -1},
5707         'filesize': {'convert': 'bytes'},
5708         'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5709         'id': {'convert': 'string', 'field': 'format_id'},
5710         'height': {'convert': 'float_none'},
5711         'width': {'convert': 'float_none'},
5712         'fps': {'convert': 'float_none'},
5713         'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5714         'tbr': {'convert': 'float_none'},
5715         'vbr': {'convert': 'float_none'},
5716         'abr': {'convert': 'float_none'},
5717         'asr': {'convert': 'float_none'},
5718         'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5719
5720         'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5721         'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'),
5722                'function': lambda it: next(filter(None, it), None)},
5723         'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'),
5724                  'function': lambda it: next(filter(None, it), None)},
5725         'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5726         'res': {'type': 'multiple', 'field': ('height', 'width'),
5727                 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5728
5729         # Actual field names
5730         'format_id': {'type': 'alias', 'field': 'id'},
5731         'preference': {'type': 'alias', 'field': 'ie_pref'},
5732         'language_preference': {'type': 'alias', 'field': 'lang'},
5733         'source_preference': {'type': 'alias', 'field': 'source'},
5734         'protocol': {'type': 'alias', 'field': 'proto'},
5735         'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5736         'audio_channels': {'type': 'alias', 'field': 'channels'},
5737
5738         # Deprecated
5739         'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5740         'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5741         'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5742         'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5743         'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5744         'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5745         'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5746         'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5747         'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5748         'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5749         'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5750         'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5751         'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5752         'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5753         'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5754         'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5755         'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5756         'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5757         'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5758         'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5759     }
5760
    def __init__(self, ydl, field_preference):
        # `ydl` must provide `params`, `write_debug` and `deprecated_feature`;
        # `field_preference` is passed to evaluate_params as the extractor's sort order
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)
5767
    def _get_field_setting(self, field, key):
        """Return the setting `key` of sort field `field`, computing its default if unset.

        Computed defaults are stored back into `self.settings`, so later
        lookups return the stored value.
        """
        if field not in self.settings:
            # Unknown fields are never forced/prioritised; this query does not register them
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                # 'extractor' fields read 'preference'; 'combined'/'multiple' use a tuple of fields
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                # Generic defaults; keys not listed here default to None
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default
        return propObj[key]
5786
    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert `value` according to the 'convert' setting of `field`.

        `value` is lowercased first. A None value is returned as None unless
        `convertNone` is set, in which case it goes through the conversion.
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            # Rank by position in the order list: earlier entries get higher values
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # The '' entry marks the position used for values that are not in the list
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value is None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # Non-numeric value: from now on treat this field as a string field
                self.settings[field]['convert'] = 'string'
                return value
5820
5821     def evaluate_params(self, params, sort_extractor):
5822         self._use_free_order = params.get('prefer_free_formats', False)
5823         self._sort_user = params.get('format_sort', [])
5824         self._sort_extractor = sort_extractor
5825
5826         def add_item(field, reverse, closest, limit_text):
5827             field = field.lower()
5828             if field in self._order:
5829                 return
5830             self._order.append(field)
5831             limit = self._resolve_field_value(field, limit_text)
5832             data = {
5833                 'reverse': reverse,
5834                 'closest': False if limit is None else closest,
5835                 'limit_text': limit_text,
5836                 'limit': limit}
5837             if field in self.settings:
5838                 self.settings[field].update(data)
5839             else:
5840                 self.settings[field] = data
5841
5842         sort_list = (
5843             tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5844             + (tuple() if params.get('format_sort_force', False)
5845                 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5846             + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5847
5848         for item in sort_list:
5849             match = re.match(self.regex, item)
5850             if match is None:
5851                 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5852             field = match.group('field')
5853             if field is None:
5854                 continue
5855             if self._get_field_setting(field, 'type') == 'alias':
5856                 alias, field = field, self._get_field_setting(field, 'field')
5857                 if self._get_field_setting(alias, 'deprecated'):
5858                     self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5859                                                 f'be removed in a future version. Please use {field} instead')
5860             reverse = match.group('reverse') is not None
5861             closest = match.group('separator') == '~'
5862             limit_text = match.group('limit')
5863
5864             has_limit = limit_text is not None
5865             has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5866             has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5867
5868             fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5869             limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5870             limit_count = len(limits)
5871             for (i, f) in enumerate(fields):
5872                 add_item(f, reverse, closest,
5873                          limits[i] if i < limit_count
5874                          else limits[0] if has_limit and not has_multiple_limits
5875                          else None)
5876
5877     def print_verbose_info(self, write_debug):
5878         if self._sort_user:
5879             write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5880         if self._sort_extractor:
5881             write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5882         write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5883             '+' if self._get_field_setting(field, 'reverse') else '', field,
5884             '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5885                           self._get_field_setting(field, 'limit_text'),
5886                           self._get_field_setting(field, 'limit'))
5887             if self._get_field_setting(field, 'limit_text') is not None else '')
5888             for field in self._order if self._get_field_setting(field, 'visible')]))
5889
5890     def _calculate_field_preference_from_value(self, format, field, type, value):
5891         reverse = self._get_field_setting(field, 'reverse')
5892         closest = self._get_field_setting(field, 'closest')
5893         limit = self._get_field_setting(field, 'limit')
5894
5895         if type == 'extractor':
5896             maximum = self._get_field_setting(field, 'max')
5897             if value is None or (maximum is not None and value >= maximum):
5898                 value = -1
5899         elif type == 'boolean':
5900             in_list = self._get_field_setting(field, 'in_list')
5901             not_in_list = self._get_field_setting(field, 'not_in_list')
5902             value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5903         elif type == 'ordered':
5904             value = self._resolve_field_value(field, value, True)
5905
5906         # try to convert to number
5907         val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5908         is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5909         if is_num:
5910             value = val_num
5911
5912         return ((-10, 0) if value is None
5913                 else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
5914                 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5915                 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5916                 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5917                 else (-1, value, 0))
5918
5919     def _calculate_field_preference(self, format, field):
5920         type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
5921         get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5922         if type == 'multiple':
5923             type = 'field'  # Only 'field' is allowed in multiple for now
5924             actual_fields = self._get_field_setting(field, 'field')
5925
5926             value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5927         else:
5928             value = get_value(field)
5929         return self._calculate_field_preference_from_value(format, field, type, value)
5930
5931     def calculate_preference(self, format):
5932         # Determine missing protocol
5933         if not format.get('protocol'):
5934             format['protocol'] = determine_protocol(format)
5935
5936         # Determine missing ext
5937         if not format.get('ext') and 'url' in format:
5938             format['ext'] = determine_ext(format['url'])
5939         if format.get('vcodec') == 'none':
5940             format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5941             format['video_ext'] = 'none'
5942         else:
5943             format['video_ext'] = format['ext']
5944             format['audio_ext'] = 'none'
5945         # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
5946         #    format['preference'] = -1000
5947
5948         if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5949             # HEVC-over-FLV is out-of-spec by FLV's original spec
5950             # ref. https://trac.ffmpeg.org/ticket/6389
5951             # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5952             format['preference'] = -100
5953
5954         # Determine missing bitrates
5955         if format.get('vcodec') == 'none':
5956             format['vbr'] = 0
5957         if format.get('acodec') == 'none':
5958             format['abr'] = 0
5959         if not format.get('vbr') and format.get('vcodec') != 'none':
5960             format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
5961         if not format.get('abr') and format.get('acodec') != 'none':
5962             format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
5963         if not format.get('tbr'):
5964             format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
5965
5966         return tuple(self._calculate_field_preference(format, field) for field in self._order)