yt_dlp/utils/_utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import collections.abc
   9 import contextlib
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import gzip
  15 import hashlib
  16 import hmac
  17 import html.entities
  18 import html.parser
  19 import http.client
  20 import http.cookiejar
  21 import inspect
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import mimetypes
  28 import operator
  29 import os
  30 import platform
  31 import random
  32 import re
  33 import shlex
  34 import socket
  35 import ssl
  36 import struct
  37 import subprocess
  38 import sys
  39 import tempfile
  40 import time
  41 import traceback
  42 import types
  43 import unicodedata
  44 import urllib.error
  45 import urllib.parse
  46 import urllib.request
  47 import xml.etree.ElementTree
  48 import zlib
  49
  50 from . import traversal
  51
  52 from ..compat import functools  # isort: split
  53 from ..compat import (
  54     compat_etree_fromstring,
  55     compat_expanduser,
  56     compat_HTMLParseError,
  57     compat_os_name,
  58     compat_shlex_quote,
  59 )
  60 from ..dependencies import brotli, certifi, websockets, xattr
  61 from ..socks import ProxyType, sockssocket
  62
# Drop the last dotted component of this module's name so that it reports its parent package
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
# (obtained as the type of the object returned by re.compile)
compiled_regex_type = type(re.compile(''))
  67
  68
  69 def random_user_agent():
  70     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  71     _CHROME_VERSIONS = (
  72         '90.0.4430.212',
  73         '90.0.4430.24',
  74         '90.0.4430.70',
  75         '90.0.4430.72',
  76         '90.0.4430.85',
  77         '90.0.4430.93',
  78         '91.0.4472.101',
  79         '91.0.4472.106',
  80         '91.0.4472.114',
  81         '91.0.4472.124',
  82         '91.0.4472.164',
  83         '91.0.4472.19',
  84         '91.0.4472.77',
  85         '92.0.4515.107',
  86         '92.0.4515.115',
  87         '92.0.4515.131',
  88         '92.0.4515.159',
  89         '92.0.4515.43',
  90         '93.0.4556.0',
  91         '93.0.4577.15',
  92         '93.0.4577.63',
  93         '93.0.4577.82',
  94         '94.0.4606.41',
  95         '94.0.4606.54',
  96         '94.0.4606.61',
  97         '94.0.4606.71',
  98         '94.0.4606.81',
  99         '94.0.4606.85',
 100         '95.0.4638.17',
 101         '95.0.4638.50',
 102         '95.0.4638.54',
 103         '95.0.4638.69',
 104         '95.0.4638.74',
 105         '96.0.4664.18',
 106         '96.0.4664.45',
 107         '96.0.4664.55',
 108         '96.0.4664.93',
 109         '97.0.4692.20',
 110     )
 111     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 112
 113
# Names of the supported content encodings; 'br' is only listed when
# the optional brotli dependency is available
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# HTTP header values; the User-Agent is picked once, when this module is imported
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


# Fixed User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
 131
 132
 133 class NO_DEFAULT:
 134     pass
 135
 136
 137 def IDENTITY(x):
 138     return x
 139
 140
# Full English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names (January first) keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
 156
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Maps a timezone abbreviation to its offset from UTC, expressed in whole hours
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
 166
# needed for sanitizing filenames in restricted mode
# Maps each accented character of the first string to its ASCII replacement; the replacements
# are chained from single characters and from lists holding the multi-character ones ('AE', 'ss', ...)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 171
# Date/time patterns written with strptime-style % directives; none of them is
# ambiguous about day/month order (the ambiguous ones live in the two lists built from this tuple)
# NOTE(review): presumably tried in order by date-parsing helpers outside this chunk - confirm
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)
 216
# DATE_FORMATS plus the numeric patterns that read the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# DATE_FORMATS plus the numeric patterns that read the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 237
# Matches `}('...',<digits>,<digits>,'...'.split('|')` and captures the four arguments
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element; its object/array body is the `json_ld` group
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Unsigned integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
 242
 243
 244 @functools.cache
 245 def preferredencoding():
 246     """Get preferred encoding.
 247
 248     Returns the best encoding scheme for the system, based on
 249     locale.getpreferredencoding() and some further tweaks.
 250     """
 251     try:
 252         pref = locale.getpreferredencoding()
 253         'TEST'.encode(pref)
 254     except Exception:
 255         pref = 'UTF-8'
 256
 257     return pref
 258
 259
 260 def write_json_file(obj, fn):
 261     """ Encode obj as JSON and write it to fn, atomically if possible """
 262
 263     tf = tempfile.NamedTemporaryFile(
 264         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 265         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 266
 267     try:
 268         with tf:
 269             json.dump(obj, tf, ensure_ascii=False)
 270         if sys.platform == 'win32':
 271             # Need to remove existing file on Windows, else os.rename raises
 272             # WindowsError or FileExistsError.
 273             with contextlib.suppress(OSError):
 274                 os.unlink(fn)
 275         with contextlib.suppress(OSError):
 276             mask = os.umask(0)
 277             os.umask(mask)
 278             os.chmod(tf.name, 0o666 & ~mask)
 279         os.rename(tf.name, fn)
 280     except Exception:
 281         with contextlib.suppress(OSError):
 282             os.remove(tf.name)
 283         raise
 284
 285
 286 def find_xpath_attr(node, xpath, key, val=None):
 287     """ Find the xpath xpath[@key=val] """
 288     assert re.match(r'^[a-zA-Z_-]+$', key)
 289     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 290     return node.find(expr)
 291
 292 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 293 # the namespace parameter
 294
 295
 296 def xpath_with_ns(path, ns_map):
 297     components = [c.split(':') for c in path.split('/')]
 298     replaced = []
 299     for c in components:
 300         if len(c) == 1:
 301             replaced.append(c[0])
 302         else:
 303             ns, tag = c
 304             replaced.append('{%s}%s' % (ns_map[ns], tag))
 305     return '/'.join(replaced)
 306
 307
 308 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 309     def _find_xpath(xpath):
 310         return node.find(xpath)
 311
 312     if isinstance(xpath, str):
 313         n = _find_xpath(xpath)
 314     else:
 315         for xp in xpath:
 316             n = _find_xpath(xp)
 317             if n is not None:
 318                 break
 319
 320     if n is None:
 321         if default is not NO_DEFAULT:
 322             return default
 323         elif fatal:
 324             name = xpath if name is None else name
 325             raise ExtractorError('Could not find XML element %s' % name)
 326         else:
 327             return None
 328     return n
 329
 330
 331 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 332     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 333     if n is None or n == default:
 334         return n
 335     if n.text is None:
 336         if default is not NO_DEFAULT:
 337             return default
 338         elif fatal:
 339             name = xpath if name is None else name
 340             raise ExtractorError('Could not find XML element\'s text %s' % name)
 341         else:
 342             return None
 343     return n.text
 344
 345
 346 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 347     n = find_xpath_attr(node, xpath, key)
 348     if n is None:
 349         if default is not NO_DEFAULT:
 350             return default
 351         elif fatal:
 352             name = f'{xpath}[@{key}]' if name is None else name
 353             raise ExtractorError('Could not find XML attribute %s' % name)
 354         else:
 355             return None
 356     return n.attrib[key]
 357
 358
 359 def get_element_by_id(id, html, **kwargs):
 360     """Return the content of the tag with the specified ID in the passed HTML document"""
 361     return get_element_by_attribute('id', id, html, **kwargs)
 362
 363
 364 def get_element_html_by_id(id, html, **kwargs):
 365     """Return the html of the tag with the specified ID in the passed HTML document"""
 366     return get_element_html_by_attribute('id', id, html, **kwargs)
 367
 368
 369 def get_element_by_class(class_name, html):
 370     """Return the content of the first tag with the specified class in the passed HTML document"""
 371     retval = get_elements_by_class(class_name, html)
 372     return retval[0] if retval else None
 373
 374
 375 def get_element_html_by_class(class_name, html):
 376     """Return the html of the first tag with the specified class in the passed HTML document"""
 377     retval = get_elements_html_by_class(class_name, html)
 378     return retval[0] if retval else None
 379
 380
 381 def get_element_by_attribute(attribute, value, html, **kwargs):
 382     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 383     return retval[0] if retval else None
 384
 385
 386 def get_element_html_by_attribute(attribute, value, html, **kargs):
 387     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 388     return retval[0] if retval else None
 389
 390
 391 def get_elements_by_class(class_name, html, **kargs):
 392     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 393     return get_elements_by_attribute(
 394         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 395         html, escape_value=False)
 396
 397
 398 def get_elements_html_by_class(class_name, html):
 399     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 400     return get_elements_html_by_attribute(
 401         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 402         html, escape_value=False)
 403
 404
 405 def get_elements_by_attribute(*args, **kwargs):
 406     """Return the content of the tag with the specified attribute in the passed HTML document"""
 407     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 408
 409
 410 def get_elements_html_by_attribute(*args, **kwargs):
 411     """Return the html of the tag with the specified attribute in the passed HTML document"""
 412     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 413
 414
 415 def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
 416     """
 417     Return the text (content) and the html (whole) of the tag with the specified
 418     attribute in the passed HTML document
 419     """
 420     if not value:
 421         return
 422
 423     quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 424
 425     value = re.escape(value) if escape_value else value
 426
 427     partial_element_re = rf'''(?x)
 428         <(?P<tag>{tag})
 429          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 430          \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
 431         '''
 432
 433     for m in re.finditer(partial_element_re, html):
 434         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 435
 436         yield (
 437             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 438             whole
 439         )
 440
 441
 442 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 443     """
 444     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 445     closing tag for the first opening tag it has encountered, and can be used
 446     as a context manager
 447     """
 448
 449     class HTMLBreakOnClosingTagException(Exception):
 450         pass
 451
 452     def __init__(self):
 453         self.tagstack = collections.deque()
 454         html.parser.HTMLParser.__init__(self)
 455
 456     def __enter__(self):
 457         return self
 458
 459     def __exit__(self, *_):
 460         self.close()
 461
 462     def close(self):
 463         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 464         # so data remains buffered; we no longer have any interest in it, thus
 465         # override this method to discard it
 466         pass
 467
 468     def handle_starttag(self, tag, _):
 469         self.tagstack.append(tag)
 470
 471     def handle_endtag(self, tag):
 472         if not self.tagstack:
 473             raise compat_HTMLParseError('no tags in the stack')
 474         while self.tagstack:
 475             inner_tag = self.tagstack.pop()
 476             if inner_tag == tag:
 477                 break
 478         else:
 479             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 480         if not self.tagstack:
 481             raise self.HTMLBreakOnClosingTagException()
 482
 483
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    @param tag   Name of the tag to look for
    @param html  The HTML document to search
    @returns     Tuple (content between the opening and the closing tag,
                 the element including both tags)
    Raises compat_HTMLParseError when the element cannot be delimited.
    """
    def find_or_raise(haystack, needle, exc):
        # Like str.index, but raises the passed exception when needle is missing
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Turn the offset relative to whole_start into the absolute index just past the '>'
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed the opening tag alone, so that it ends up at the bottom of the tag stack
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document up to each successive closing tag until the parser
        # signals that the element opened above has been closed
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                # The positions of the closing tag are relative to offset
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
 518
 519
 520 class HTMLAttributeParser(html.parser.HTMLParser):
 521     """Trivial HTML parser to gather the attributes for a single element"""
 522
 523     def __init__(self):
 524         self.attrs = {}
 525         html.parser.HTMLParser.__init__(self)
 526
 527     def handle_starttag(self, tag, attrs):
 528         self.attrs = dict(attrs)
 529         raise compat_HTMLParseError('done')
 530
 531
 532 class HTMLListAttrsParser(html.parser.HTMLParser):
 533     """HTML parser to gather the attributes for the elements of a list"""
 534
 535     def __init__(self):
 536         html.parser.HTMLParser.__init__(self)
 537         self.items = []
 538         self._level = 0
 539
 540     def handle_starttag(self, tag, attrs):
 541         if tag == 'li' and self._level == 0:
 542             self.items.append(dict(attrs))
 543         self._level += 1
 544
 545     def handle_endtag(self, tag):
 546         self._level -= 1
 547
 548
 549 def extract_attributes(html_element):
 550     """Given a string for an HTML element such as
 551     <el
 552          a="foo" B="bar" c="&98;az" d=boz
 553          empty= noval entity="&amp;"
 554          sq='"' dq="'"
 555     >
 556     Decode and return a dictionary of attributes.
 557     {
 558         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 559         'empty': '', 'noval': None, 'entity': '&',
 560         'sq': '"', 'dq': '\''
 561     }.
 562     """
 563     parser = HTMLAttributeParser()
 564     with contextlib.suppress(compat_HTMLParseError):
 565         parser.feed(html_element)
 566         parser.close()
 567     return parser.attrs
 568
 569
 570 def parse_list(webpage):
 571     """Given a string for an series of HTML <li> elements,
 572     return a dictionary of their attributes"""
 573     parser = HTMLListAttrsParser()
 574     parser.feed(webpage)
 575     parser.close()
 576     return parser.items
 577
 578
 579 def clean_html(html):
 580     """Clean an HTML snippet into a readable string"""
 581
 582     if html is None:  # Convenience for sanitizing descriptions etc.
 583         return html
 584
 585     html = re.sub(r'\s+', ' ', html)
 586     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 587     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 588     # Strip html tags
 589     html = re.sub('<.*?>', '', html)
 590     # Replace html entities
 591     html = unescapeHTML(html)
 592     return html.strip()
 593
 594
 595 class LenientJSONDecoder(json.JSONDecoder):
 596     # TODO: Write tests
 597     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 598         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 599         self._close_attempts = 2 * close_objects
 600         super().__init__(*args, **kwargs)
 601
 602     @staticmethod
 603     def _close_object(err):
 604         doc = err.doc[:err.pos]
 605         # We need to add comma first to get the correct error message
 606         if err.msg.startswith('Expecting \',\''):
 607             return doc + ','
 608         elif not doc.endswith(','):
 609             return
 610
 611         if err.msg.startswith('Expecting property name'):
 612             return doc[:-1] + '}'
 613         elif err.msg.startswith('Expecting value'):
 614             return doc[:-1] + ']'
 615
 616     def decode(self, s):
 617         if self.transform_source:
 618             s = self.transform_source(s)
 619         for attempt in range(self._close_attempts + 1):
 620             try:
 621                 if self.ignore_extra:
 622                     return self.raw_decode(s.lstrip())[0]
 623                 return super().decode(s)
 624             except json.JSONDecodeError as e:
 625                 if e.pos is None:
 626                     raise
 627                 elif attempt < self._close_attempts:
 628                     s = self._close_object(e)
 629                     if s is not None:
 630                         continue
 631                 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
 632         assert False, 'Too many attempts to decode JSON'
 633
 634
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # '-' selects standard output instead of a file on disk
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # The first pass uses the name as given, the second one the name returned by sanitize_path()
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Fall back to opening the file without a lock
                # NOTE(review): LockingUnsupportedError is presumably an OSError subclass,
                # so that the win32 branch above ends up here - confirm against its definition
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up after the second failure, or right away on a permission error
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Retrying is pointless when sanitizing did not change the name
            if old_filename == filename:
                raise
 671                 raise
 672
 673
 674 def timeconvert(timestr):
 675     """Convert RFC 2822 defined time string into system timestamp"""
 676     timestamp = None
 677     timetuple = email.utils.parsedate_tz(timestr)
 678     if timetuple is not None:
 679         timestamp = email.utils.mktime_tz(timetuple)
 680     return timestamp
 681
 682
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; '' for an empty input, otherwise never empty
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Returns the replacement for a single character. Substitute characters are
        # prefixed with '\0' so that they can be told apart from characters of the
        # input; the markers are dealt with after all characters have been mapped
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Question marks and control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers themselves; what remains must not be empty
    result = result.replace('\0', '') or '_'

    # NO_DEFAULT is a class and therefore truthy, so this cleanup only
    # runs when is_id was explicitly passed as a false value
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 736
 737
 738 def sanitize_path(s, force=False):
 739     """Sanitizes and normalizes path on Windows"""
 740     if sys.platform == 'win32':
 741         force = False
 742         drive_or_unc, _ = os.path.splitdrive(s)
 743     elif force:
 744         drive_or_unc = ''
 745     else:
 746         return s
 747
 748     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 749     if drive_or_unc:
 750         norm_path.pop(0)
 751     sanitized_path = [
 752         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 753         for path_part in norm_path]
 754     if drive_or_unc:
 755         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 756     elif force and s and s[0] == os.path.sep:
 757         sanitized_path.insert(0, os.path.sep)
 758     return os.path.join(*sanitized_path)
 759
 760
 761 def sanitize_url(url, *, scheme='http'):
 762     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 763     # the number of unwanted failures due to missing protocol
 764     if url is None:
 765         return
 766     elif url.startswith('//'):
 767         return f'{scheme}:{url}'
 768     # Fix some common typos seen so far
 769     COMMON_TYPOS = (
 770         # https://github.com/ytdl-org/youtube-dl/issues/15649
 771         (r'^httpss://', r'https://'),
 772         # https://bx1.be/lives/direct-tv/
 773         (r'^rmtp([es]?)://', r'rtmp\1://'),
 774     )
 775     for mistake, fixup in COMMON_TYPOS:
 776         if re.match(mistake, url):
 777             return re.sub(mistake, fixup, url)
 778     return url
 779
 780
 781 def extract_basic_auth(url):
 782     parts = urllib.parse.urlsplit(url)
 783     if parts.username is None:
 784         return url, None
 785     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 786         parts.hostname if parts.port is None
 787         else '%s:%d' % (parts.hostname, parts.port))))
 788     auth_payload = base64.b64encode(
 789         ('%s:%s' % (parts.username, parts.password or '')).encode())
 790     return url, f'Basic {auth_payload.decode()}'
 791
 792
 793 def sanitized_Request(url, *args, **kwargs):
 794     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 795     if auth_header is not None:
 796         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 797         headers['Authorization'] = auth_header
 798     return urllib.request.Request(url, *args, **kwargs)
 799
 800
 801 def expand_path(s):
 802     """Expand shell variables and ~"""
 803     return os.path.expandvars(compat_expanduser(s))
 804
 805
 806 def orderedSet(iterable, *, lazy=False):
 807     """Remove all duplicates from the input iterable"""
 808     def _iter():
 809         seen = []  # Do not use set since the items can be unhashable
 810         for x in iterable:
 811             if x not in seen:
 812                 seen.append(x)
 813                 yield x
 814
 815     return _iter() if lazy else list(_iter())
 816
 817
 818 def _htmlentity_transform(entity_with_semicolon):
 819     """Transforms an HTML entity to a character."""
 820     entity = entity_with_semicolon[:-1]
 821
 822     # Known non-numeric HTML entity
 823     if entity in html.entities.name2codepoint:
 824         return chr(html.entities.name2codepoint[entity])
 825
 826     # TODO: HTML5 allows entities without a semicolon.
 827     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 828     if entity_with_semicolon in html.entities.html5:
 829         return html.entities.html5[entity_with_semicolon]
 830
 831     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 832     if mobj is not None:
 833         numstr = mobj.group(1)
 834         if numstr.startswith('x'):
 835             base = 16
 836             numstr = '0%s' % numstr
 837         else:
 838             base = 10
 839         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 840         with contextlib.suppress(ValueError):
 841             return chr(int(numstr, base))
 842
 843     # Unknown entity in name, return its literal representation
 844     return '&%s;' % entity
 845
 846
 847 def unescapeHTML(s):
 848     if s is None:
 849         return None
 850     assert isinstance(s, str)
 851
 852     return re.sub(
 853         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 854
 855
 856 def escapeHTML(text):
 857     return (
 858         text
 859         .replace('&', '&amp;')
 860         .replace('<', '&lt;')
 861         .replace('>', '&gt;')
 862         .replace('"', '&quot;')
 863         .replace("'", '&#39;')
 864     )
 865
 866
 867 def process_communicate_or_kill(p, *args, **kwargs):
 868     deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
 869                         f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
 870     return Popen.communicate_or_kill(p, *args, **kwargs)
 871
 872
 873 class Popen(subprocess.Popen):
 874     if sys.platform == 'win32':
 875         _startupinfo = subprocess.STARTUPINFO()
 876         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 877     else:
 878         _startupinfo = None
 879
 880     @staticmethod
 881     def _fix_pyinstaller_ld_path(env):
 882         """Restore LD_LIBRARY_PATH when using PyInstaller
 883             Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
 884                  https://github.com/yt-dlp/yt-dlp/issues/4573
 885         """
 886         if not hasattr(sys, '_MEIPASS'):
 887             return
 888
 889         def _fix(key):
 890             orig = env.get(f'{key}_ORIG')
 891             if orig is None:
 892                 env.pop(key, None)
 893             else:
 894                 env[key] = orig
 895
 896         _fix('LD_LIBRARY_PATH')  # Linux
 897         _fix('DYLD_LIBRARY_PATH')  # macOS
 898
 899     def __init__(self, *args, env=None, text=False, **kwargs):
 900         if env is None:
 901             env = os.environ.copy()
 902         self._fix_pyinstaller_ld_path(env)
 903
 904         self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
 905         if text is True:
 906             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 907             kwargs.setdefault('encoding', 'utf-8')
 908             kwargs.setdefault('errors', 'replace')
 909         super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
 910
 911     def communicate_or_kill(self, *args, **kwargs):
 912         try:
 913             return self.communicate(*args, **kwargs)
 914         except BaseException:  # Including KeyboardInterrupt
 915             self.kill(timeout=None)
 916             raise
 917
 918     def kill(self, *, timeout=0):
 919         super().kill()
 920         if timeout != 0:
 921             self.wait(timeout=timeout)
 922
 923     @classmethod
 924     def run(cls, *args, timeout=None, **kwargs):
 925         with cls(*args, **kwargs) as proc:
 926             default = '' if proc.__text_mode else b''
 927             stdout, stderr = proc.communicate_or_kill(timeout=timeout)
 928             return stdout or default, stderr or default, proc.returncode
 929
 930
 931 def encodeArgument(s):
 932     # Legacy code that uses byte strings
 933     # Uncomment the following line after fixing all post processors
 934     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 935     return s if isinstance(s, str) else s.decode('ascii')
 936
 937
 938 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 939
 940
 941 def timetuple_from_msec(msec):
 942     secs, msec = divmod(msec, 1000)
 943     mins, secs = divmod(secs, 60)
 944     hrs, mins = divmod(mins, 60)
 945     return _timetuple(hrs, mins, secs, msec)
 946
 947
 948 def formatSeconds(secs, delim=':', msec=False):
 949     time = timetuple_from_msec(secs * 1000)
 950     if time.hours:
 951         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 952     elif time.minutes:
 953         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 954     else:
 955         ret = '%d' % time.seconds
 956     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 957
 958
 959 def _ssl_load_windows_store_certs(ssl_context, storename):
 960     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 961     try:
 962         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 963                  if encoding == 'x509_asn' and (
 964                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 965     except PermissionError:
 966         return
 967     for cert in certs:
 968         with contextlib.suppress(ssl.SSLError):
 969             ssl_context.load_verify_locations(cadata=cert)
 970
 971
def make_HTTPS_handler(params, **kwargs):
    """Build an SSL context from params and return a YoutubeDLHTTPSHandler using it.

    Keys read from params: 'nocheckcertificate', 'legacyserverconnect', 'compat_opts',
    'client_certificate', 'client_certificate_key' and 'client_certificate_password'.
    Extra keyword arguments are passed on to YoutubeDLHTTPSHandler.
    Raises YoutubeDLError if the client certificate cannot be loaded.
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # check_hostname is set before verify_mode below: ssl does not allow
    # switching to CERT_NONE while hostname checking is still enabled
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer the certifi bundle unless it is missing or disabled via the 'no-certifi' compat option
        if certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1033
1034
def bug_reports_message(before=';'):
    """Return the "please report this issue" text, prefixed with before.

    The text starts with a capital letter when before is empty (after
    stripping trailing whitespace) or ends a sentence ('.', '!' or '?').
    """
    from ..update import REPOSITORY

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    prefix = before.rstrip()
    starts_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_sentence:
        msg = msg[0].title() + msg[1:]

    if not prefix:
        return msg
    return prefix + ' ' + msg
1046
1047
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""

    # Default message; subclasses may override it at class level
    msg = None

    def __init__(self, msg=None):
        """Use msg if given, else the class-level msg, else the class name."""
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
1058
1059
# Tuple of exception types that are treated as network errors
network_exceptions = (urllib.error.URLError, http.client.HTTPException, socket.error)
if hasattr(ssl, 'CertificateError'):
    network_exceptions += (ssl.CertificateError,)
1064
1065
class ExtractorError(YoutubeDLError):
    """Error during info extraction.

    The final message is assembled from the ie, video_id, orig_msg, cause and
    expected attributes, and is rebuilt whenever one of the attributes is changed
    after construction (see __setattr__).
    """

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        cause, video_id and ie, if given, are included in the formatted message.
        """
        # An error created while a network exception is being handled is always "expected".
        # This is a membership test, so only the exact types in network_exceptions match
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When wrapping another ExtractorError, keep the exc_info that one recorded
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        """The full message: '[ie] video_id: orig_msg (caused by ...)' plus the bug report text unless expected."""
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted traceback and/or cause joined by a newline, or None if there is neither."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        """Set the attribute and, once msg has a value, rebuild msg and args from the current attributes."""
        super().__setattr__(name, value)
        # msg is falsy until it is first assigned, so nothing is rebuilt before that.
        # 'msg' and 'args' themselves are excluded since they are assigned right below
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
1107
1108
class UnsupportedError(ExtractorError):
    """Expected extraction error for an unsupported URL; the URL is kept in the url attribute."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1114
1115
class RegexNotFoundError(ExtractorError):
    """Extraction error for a regular expression that did not match"""
1119
1120
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    May be raised when a website does not make a video available in your
    geographic location. The error is always marked as expected; countries,
    if given, is stored on the instance unchanged.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # expected is forced to True regardless of what the caller passed
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
1132
1133
class UserNotLive(ExtractorError):
    """Expected extraction error for a channel/user that is not live"""

    def __init__(self, msg=None, **kwargs):
        # expected is forced to True regardless of what the caller passed
        options = {**kwargs, 'expected': True}
        super().__init__(msg or 'The channel is not currently live', **options)
1140
1141
class DownloadError(YoutubeDLError):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        # Stored as-is for whoever handles the error
        self.exc_info = exc_info
1154
1155
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    Thrown by YoutubeDL when a requested entry cannot be found
    in the playlist info_dict
    """

    msg = 'Entry not found in info'
1163
1164
class SameFileError(YoutubeDLError):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple
    files would have to be downloaded to the same file on disk.
    """

    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """filename, if given, is appended to the default message."""
        message = self.msg if filename is None else self.msg + f': {filename}'
        super().__init__(message)
1177
1178
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to
    signal that the postprocessing task failed.
    """
1185
1186
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted."""

    msg = 'The download was cancelled'
1190
1191
class ExistingVideoReached(DownloadCancelled):
    """Download cancelled because --break-on-existing was triggered."""

    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1195
1196
class RejectedVideoReached(DownloadCancelled):
    """Download cancelled because --break-match-filter was triggered."""

    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1200
1201
class MaxDownloadsReached(DownloadCancelled):
    """Download cancelled because the --max-downloads limit has been reached."""

    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1205
1206
class ReExtractInfo(YoutubeDLError):
    """Raised when the video info needs to be re-extracted."""

    def __init__(self, msg, expected=False):
        """msg is the error message; expected is stored on the instance unchanged."""
        super().__init__(msg)
        self.expected = expected
1213
1214
class ThrottledDownload(ReExtractInfo):
    """Raised when the download speed is below --throttled-rate."""

    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always uses the class-level message and is never marked as expected
        super().__init__(msg=self.msg, expected=False)
1221
1222
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Thrown when a video is requested in a format
    that is not available for that video.
    """

    msg = 'Unable to download video'

    def __init__(self, err=None):
        """err, if given, is appended to the default message."""
        message = self.msg if err is None else self.msg + f': {err}'
        super().__init__(message)
1235
1236
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, which indicates that
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
1250
1251
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an errno-style code and a message, classified into a reason.

    reason is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify using the error code first, then well-known fragments of the message
        if code in (errno.ENOSPC, errno.EDQUOT) or 'No space left' in msg or 'Disk quota exceeded' in msg:
            reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1266
1267
class XAttrUnavailableError(YoutubeDLError):
    """YoutubeDL error type; presumably signals that xattr support is unavailable (raised outside this section)."""
1270
1271
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and, if configured, make it connect from a fixed source address.

    The source address is read from ydl_handler._params['source_address']. When it is set,
    the connection's _create_connection hook (if it has one) is replaced with a version that
    only tries remote addresses of the same IP family as the source address.
    is_https is accepted but not used in this function.
    Returns the connection object.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        # NOTE(review): the source_address parameter below shadows the outer variable and is
        # indexed with [0]; presumably it receives the (address, port) tuple stored in
        # hc.source_address further down - confirm against http.client
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A dot in the source address is taken to mean IPv4, anything else IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate in turn; the first successful connection wins
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    # Remember the failure and close the socket before trying the next address
                    err = _
                    if sock is not None:
                        sock.close()
            # No candidate worked: re-raise the last failure, if there was one
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1317
1318
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """params is stored as self._params; remaining arguments go to HTTPHandler."""
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, going through a SOCKS proxy if the 'Ytdl-socks-proxy' header is set."""
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # The header only carries the proxy setting; remove it from the request
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli data; empty input is returned as-is."""
        if not data:
            return data
        # Inside the function body, "brotli" is the module-level name, not this method
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress gzip data, tolerating up to 1023 bytes of trailing junk.

        Raises the original OSError if no truncated variant can be read either.
        """
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                # The loop has no break, so this runs once every truncation has failed
                raise original_oserror

    def http_request(self, req):
        """Percent-encode the request URL and add the default and Accept-encoding headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Headers already present on the request take precedence over the defaults
        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Youtubedl-no-compression' in req.headers:  # deprecated
            req.headers.pop('Youtubedl-no-compression', None)
            req.add_header('Accept-encoding', 'identity')

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Decode the response body per Content-encoding and percent-encode redirect locations."""
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            # The first decoder reads the body; later ones work on the previous output.
            # Unknown encodings (and 'br' when brotli is unavailable) are skipped
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            # Wrap the decoded body in a new response carrying the original headers, URL, code and msg
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic goes through the same request/response processing
    https_request = http_request
    https_response = http_response
1446
1447
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of base_class that connects through the given SOCKS proxy.

    base_class must be http.client.HTTPConnection, http.client.HTTPSConnection
    or a subclass of either.
    socks_proxy is the proxy URL; its scheme selects the protocol (socks5,
    socks4a, socks4 or socks), the port defaults to 1080 and the username and
    password, if any, are percent-decoded.
    Raises ValueError if the scheme is not one of the supported SOCKS schemes.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail early and clearly instead of hitting an unbound socks_type below
        raise ValueError(f'Unknown SOCKS proxy version: {scheme}')

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, TLS is layered on top of the proxied socket
            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    # Legacy fallback for connections without an SSL context
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1489
1490
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler with source address and SOCKS proxy support and a custom connection class."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """params is stored as self._params; https_conn_class defaults to http.client.HTTPSConnection."""
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection

    def https_open(self, req):
        """Open req, going through a SOCKS proxy if the 'Ytdl-socks-proxy' header is set.

        A TLS handshake failure alert is re-raised as a YoutubeDLError that
        suggests --legacy-server-connect.
        """
        conn_class = self._https_conn_class

        # Forward whichever of these attributes the base handler has set
        open_kwargs = {}
        for option, attribute in (('context', '_context'), ('check_hostname', '_check_hostname')):
            if hasattr(self, attribute):
                open_kwargs[option] = getattr(self, attribute)

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, conn_class, True)
        try:
            return self.do_open(connection_factory, req, **open_kwargs)
        except urllib.error.URLError as e:
            failure = e.reason
            is_handshake_failure = (
                isinstance(failure, ssl.SSLError)
                and getattr(failure, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE')
            if is_handshake_failure:
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
1519
1520
def is_path_like(f):
    """Return whether f is a str, bytes or os.PathLike object."""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1523
1524
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """HTTPCookieProcessor that applies the HTTP request/response processing to HTTPS as well."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response."""
        return super().http_response(request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1534
1535
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7231
     and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the request that follows a redirect of req to newurl.

        The method is changed to GET for a 303 (unless it was HEAD) and for a
        POST answered with 301 or 302. When the method changes, the payload and
        the Content-Length/Content-Type headers are dropped.
        Raises urllib.error.HTTPError for any code other than 301, 302, 303, 307 and 308.
        """
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_method = req.get_method()
        new_data = req.data
        # Lower-case names of the headers that must not be carried over
        remove_headers = set()
        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and req.get_method() != 'HEAD':
            new_method = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        elif code in (301, 302) and req.get_method() == 'POST':
            new_method = 'GET'

        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.update(('content-length', 'content-type'))

        # urllib stores header names capitalized (e.g. 'Content-type'), so the
        # comparison has to be case-insensitive on both sides
        new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
1579
1580
def extract_timezone(date_str):
    """Split a trailing timezone off date_str.

    Returns a tuple (offset, remainder) where offset is a datetime.timedelta and
    remainder is date_str without the timezone part. Both a numeric offset (or "Z")
    and, failing that, a timezone name listed in TIMEZONE_NAMES following a time
    are recognised. If neither is found, the offset is zero and date_str is
    returned unchanged.
    """
    numeric = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)

    if numeric:
        remainder = date_str[:-len(numeric.group('tz'))]
        # No sign means that the bare "Z" alternative matched
        if not numeric.group('sign'):
            return datetime.timedelta(), remainder
        direction = 1 if numeric.group('sign') == '+' else -1
        offset = datetime.timedelta(
            hours=direction * int(numeric.group('hours')),
            minutes=direction * int(numeric.group('minutes')))
        return offset, remainder

    # Fall back to a timezone abbreviation following a time
    named = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    offset_hours = TIMEZONE_NAMES.get(named and named.group('tz').strip())
    if offset_hours is not None:
        date_str = date_str[:-len(named.group('tz'))]
    return datetime.timedelta(hours=offset_hours or 0), date_str
1609
1610
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp from the given ISO 8601 date string.

    Fractional seconds are dropped. `timezone` is a datetime.timedelta UTC
    offset; when it is None the offset is extracted from the string itself.
    Returns None if `date_str` is None or cannot be parsed.
    """
    if date_str is None:
        return None

    without_fraction = re.sub(r'\.[0-9]+', '', date_str)
    if timezone is None:
        timezone, without_fraction = extract_timezone(without_fraction)

    try:
        local_time = datetime.datetime.strptime(
            without_fraction, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((local_time - timezone).timetuple())
    except ValueError:
        return None
1626
1627
def date_formats(day_first=True):
    """Return the date format collection for day-first or month-first parsing."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1630
1631
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD, or None if unparsable"""
    if date_str is None:
        return None

    # Commas become spaces, then AM/PM (+ trailing timezone word) is dropped
    cleaned = re.sub(
        r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    _, cleaned = extract_timezone(cleaned)

    result = None
    # Every format is tried; the last one that parses determines the result
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to the RFC 2822 style parser
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if result is None else str(result)
1654
1655
def unified_timestamp(date_str, day_first=True):
    """Parse a loosely formatted date/time string into a UNIX timestamp.

    Returns None if `date_str` is None or no known format matches.
    """
    if date_str is None:
        return None

    # Drop commas, pipes and weekday names, then collapse whitespace runs
    without_weekday = re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)
    date_str = re.sub(r'\s+', ' ', without_weekday)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    unknown_tz = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if unknown_tz:
        date_str = date_str[:-len(unknown_tz.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    nanoseconds = re.search(
        r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if nanoseconds:
        date_str = nanoseconds.group(1)

    # The first format that parses wins
    for fmt in date_formats(day_first):
        try:
            moment = (
                datetime.datetime.strptime(date_str, fmt)
                - timezone + datetime.timedelta(hours=pm_delta))
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    parsed = email.utils.parsedate_tz(date_str)
    if not parsed:
        return None
    return calendar.timegm(parsed) + pm_delta * 3600 - timezone.total_seconds()
1687
1688
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to `default_ext`."""
    if url is None or '.' not in url:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1700
1701
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by "<sub_lang>.<sub_format>"."""
    subtitle_ext = '.'.join((sub_lang, sub_format))
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1704
1705
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    @returns            A naive datetime; now/today/yesterday are taken from the UTC clock
    @raises ValueError  If DATE does not match `format`
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    # datetime.utcnow() is deprecated since Python 3.12; this builds the same naive UTC value
    utc_now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    today = datetime_round(utc_now, precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # The part before the offset is itself parsed recursively
        start_time = datetime_from_str(match.group('start'), precision, format)
        amount = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit in ('month', 'year'):
            # Calendar-aware arithmetic; the result is rounded to days in auto mode
            new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                amount *= 7
            new_date = start_time + datetime.timedelta(**{unit + 's': amount})
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1746
1747
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    @raises ValueError  In strict mode, when date_str is outside the allowed patterns
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1758
1759
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    # Work with a zero-based month index so divmod yields the year carry directly
    year_carry, month_index = divmod(dt.month + months - 1, 12)
    target_year = dt.year + year_carry
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(year=target_year, month=target_month, day=min(dt.day, last_day))
1767
1768
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    @param dt           The datetime to round; its fields are read as UTC
    @param precision    microsecond|second|minute|hour|day.
                        'microsecond' returns dt itself, unchanged
    @returns            A naive datetime rounded to the nearest unit (halves round up)
    @raises KeyError    If precision is not one of the supported units
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    step = unit_seconds[precision]
    # timetuple() carries no sub-second part, so microseconds are discarded here
    timestamp = calendar.timegm(dt.timetuple())
    rounded = (timestamp + step // 2) // step * step
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; this builds the same naive UTC value
    return datetime.datetime.fromtimestamp(rounded, datetime.timezone.utc).replace(tzinfo=None)
1785
1786
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.
    Strings in any other format are returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if parts is None else '-'.join(parts.groups())
1795
1796
class DateRange:
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError if start lies after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start, strict=True))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end, strict=True))
        if self.end < self.start:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __repr__(self):
        bounds = ', '.join(repr(edge.isoformat()) for edge in (self.start, self.end))
        return f'{__name__}.{type(self).__name__}({bounds})'

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return (self.start, self.end) == (other.start, other.end)
1830
1831
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime, platform, OpenSSL and libc."""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_parts = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_parts = []
    libc_suffix = format_field(join_nonempty(*libc_parts, delim=' '), None, ', %s')

    python_info = (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
    )
    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        *python_info, platform.platform(), ssl.OPENSSL_VERSION, libc_suffix)
1850
1851
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1859
1860
def write_string(s, out=None, encoding=None):
    """Write the string `s` to `out` (sys.stderr by default) and flush it.

    For binary-mode streams, and for streams exposing a `buffer`, the text
    is encoded first (unencodable characters are ignored) using `encoding`,
    the stream's own encoding, or the preferred encoding, in that order.
    """
    assert isinstance(s, str)
    stream = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not stream:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(stream):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(stream, 'mode', ''):
        target, enc = stream, encoding or preferredencoding()
    elif hasattr(stream, 'buffer'):
        target = stream.buffer
        enc = encoding or getattr(stream, 'encoding', None) or preferredencoding()
    else:
        # Plain text stream without an underlying buffer: write the str as is
        target, enc = stream, None

    payload = s.encode(enc, 'ignore') if enc else s
    target.write(payload)
    stream.flush()
1880
1881
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report a deprecation.

    When running as CLI, each distinct message is reported only once, via
    `printer` if given or write_string otherwise (extra kwargs are forwarded).
    Otherwise a DeprecationWarning is issued through the warnings module.
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    already_reported = deprecation_warning._cache
    if msg in already_reported:
        return
    already_reported.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Messages that were already reported in CLI mode
deprecation_warning._cache = set()
1897
1898
def bytes_to_intlist(bs):
    """Convert a byte sequence (or a str of single characters) to a list of ints."""
    if not bs:
        return []
    # Indexing bytes yields ints; anything else is converted per character
    return list(bs) if isinstance(bs[0], int) else list(map(ord, bs))
1906
1907
def intlist_to_bytes(xs):
    """Pack a sequence of ints (each 0-255) into a bytes object."""
    if xs:
        return struct.pack(f'{len(xs)}B', *xs)
    return b''
1912
1913
class LockingUnsupportedError(OSError):
    """Raised when no file locking implementation is available."""

    msg = 'File locking is not supported'

    def __init__(self):
        # The message is fixed; callers construct this without arguments
        super(LockingUnsupportedError, self).__init__(self.msg)
1919
1920
1921 # Cross-platform file locking
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range used for both locking and unlocking (low/high DWORD of the length)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the file object `f`; raises BlockingIOError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file passes the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 requests an exclusive lock, 0x1 makes the call non-blocking
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock taken by _lock_file on `f`; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError(f'Unlocking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock `f` with flock(), falling back to lockf() on OSError."""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # Lock is held elsewhere: report it, do not try the fallback
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Unlock `f`, trying each unlock variant until one succeeds."""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            """No locking backend available on this platform."""
            raise LockingUnsupportedError()

        def _unlock_file(f):
            """No locking backend available on this platform."""
            raise LockingUnsupportedError()
2007
2008
class locked_file:
    """File wrapper that takes a lock on entering and releases it on exit.

    Read modes take a shared lock, all other modes an exclusive one.
    The file is opened in __init__ but only locked (and, for 'w' modes,
    truncated) in __enter__/open. Unknown attributes are forwarded to the
    underlying file object.

    @param filename  Path of the file to open
    @param mode      One of r, rb, a, ab, w, wb (else NotImplementedError)
    @param block     Whether to wait for the lock instead of failing immediately
    @param encoding  Passed on to os.fdopen
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    # __exit__ is never called when __enter__ raises, so release
                    # the lock and the descriptor here instead of leaking them
                    with contextlib.suppress(OSError):
                        self.__exit__()
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; the file itself stays open."""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Only called for attributes not found on the wrapper itself
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2072
2073
@functools.cache
def get_filesystem_encoding():
    """Return the filesystem encoding reported by sys, or 'utf-8' if it reports None."""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
2078
2079
def shell_quote(args):
    """Return the arguments shell-quoted and joined by spaces into one string."""
    encoding = get_filesystem_encoding()
    # We may get a filename encoded with 'encodeFilename', so bytes are decoded first
    return ' '.join(
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
2089
2090
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    The data is JSON-encoded into the URL fragment. If `url` already carries
    smuggled data, both are merged, with the already smuggled values taking
    precedence. The caller's `data` mapping is left unmodified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a new dict instead of update()-ing the caller's object
    merged = {**data, **idata}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    Returns (smug_url, default) if the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
2108
2109
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    @param num      The number to format
    @param fmt      %-format taking the scaled number and the suffix
    @param factor   Ratio between consecutive suffixes; 1024 selects Ki, Mi, ...
    @returns        The formatted string, or None if num is not a number or is negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to [0, len(suffixes)]: for small fractions the logarithm is negative,
        # which would otherwise index the suffix list from the end (giving "Y")
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2122
2123
def format_bytes(bytes):
    """Format a byte count with binary suffixes (two decimals), or 'N/A' if not formattable."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2126
2127
def lookup_unit_table(unit_table, s, strict=False):
    """Parse "<number><unit>" from `s` and scale the number by unit_table[unit].

    In strict mode the whole string must match; otherwise only its start has
    to, and a comma is also accepted as decimal separator.
    Returns the rounded result, or None if `s` does not match.
    """
    if strict:
        number_pattern, matcher = NUMBER_RE, re.fullmatch
    else:
        number_pattern, matcher = NUMBER_RE.replace(R'\.', '[,.]'), re.match
    unit_pattern = '|'.join(map(re.escape, unit_table))

    found = matcher(rf'(?P<num>{number_pattern})\s*(?P<unit>{unit_pattern})\b', s)
    if not found:
        return None

    value = float(found.group('num').replace(',', '.'))
    return round(value * unit_table[found.group('unit')])
2139
2140
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer

    The unit is a single optional letter (K, M, G, ...), case-insensitive,
    and always denotes a power of 1024.
    """
    powers_of_1024 = {unit: 1024**exponent for exponent, unit in enumerate(['', *'KMGTPEZY'])}
    return lookup_unit_table(powers_of_1024, s.upper(), strict=True)
2146
2147
def parse_filesize(s):
    """Parse a human-readable file size such as "5 MiB" into a number of bytes.

    Returns None if `s` is None or does not start with a recognized size.
    """
    if s is None:
        return None

    # (symbol, decimal prefix name, binary prefix name), in ascending magnitude
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    for power, (symbol, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = symbol.lower()
        unit_table.update({
            f'{symbol}iB': binary,
            f'{symbol}B': decimal,
            f'{lower}B': binary,
            f'{symbol}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
2217
2218
def parse_count(s):
    """Parse a count such as "1,234" or "1.5M" into an integer.

    Returns None if `s` is None or no number can be extracted.
    """
    if s is None:
        return None

    # Drop a leading non-numeric label that is followed by whitespace
    text = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, text)
    if scaled is not None:
        return scaled

    # Last resort: a plain number followed by arbitrary text
    plain = re.match(r'([\d,.]+)(?:$|\s)', text)
    return str_to_int(plain.group(1)) if plain else None
2246
2247
def parse_resolution(s, *, lenient=False):
    """Extract a resolution from a string.

    Recognizes "WIDTHxHEIGHT", "HEIGHTp"/"HEIGHTi" and "4k"/"8k", in that
    order. With `lenient`, WIDTHxHEIGHT may be directly surrounded by
    letters or digits. Returns a dict with 'width' and/or 'height' keys,
    or an empty dict if nothing is found.
    """
    if s is None:
        return {}

    dimensions_re = (
        r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
        else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    dimensions = re.search(dimensions_re, s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    scanlines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scanlines:
        return {'height': int(scanlines.group(1))}

    marketing = re.search(r'\b([48])[kK]\b', s)
    if marketing:
        return {'height': int(marketing.group(1)) * 540}

    return {}
2271
2272
def parse_bitrate(s):
    """Return the integer in front of "kbps" in `s`, or None if there is none."""
    if not isinstance(s, str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2279
2280
def month_by_name(name, lang='en'):
    """ Return the number of a month by its full name in the given language
    (English names are used for unknown languages), or None if not found """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
2290
2291
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, i.e. the first three letters of its name """

    abbreviations = [full_name[:3] for full_name in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
2300
2301
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity by '&amp;' in XML"""
    # Ampersands that begin one of the predefined entities or a numeric
    # character reference are left alone
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
2308
2309
def setproctitle(title):
    """Set the process name through libc's prctl(), if available.

    Silently does nothing when ctypes or 'libc.so.6' cannot be loaded,
    or when the loaded libc has no prctl.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initializing from bytes makes the buffer one item larger than the data,
    # so the name handed to prctl is always NUL-terminated
    buf = ctypes.create_string_buffer(title.encode())
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
2335
2336
def remove_start(s, start):
    """Return `s` without the leading `start`; `s` is returned as is if it is None or has no such prefix."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2339
2340
def remove_end(s, end):
    """Return `s` without the trailing `end`; `s` is returned as is if it is None or has no such suffix."""
    if s is None or not s.endswith(end):
        return s
    # An empty suffix needs special care: s[:-0] is s[:0], i.e. the empty string
    return s[:-len(end)] if end else s
2343
2344
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding `s`, if present."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
2352
2353
def get_domain(url):
    """
    Return the network location of `url` without a leading "www.",
    or None if it is empty.

    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
2360
2361
def url_basename(url):
    """Return the last segment of the URL's path, ignoring surrounding slashes."""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
2365
2366
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any query or fragment.

    Raises AttributeError if `url` does not have that form.
    """
    prefix = re.match(r'https?://[^?#]+/', url)
    return prefix.group()
2369
2370
def urljoin(base, path):
    """Join `path` onto `base`, returning None for unusable input.

    A `path` that already is an absolute or protocol-relative URL is returned
    unchanged. Otherwise `base` has to be an http(s) or protocol-relative URL.
    bytes arguments are decoded first.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    path_is_url = re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path)
    if path_is_url:
        return path

    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str):
        return None
    if not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
2384
2385
class HEADRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always HEAD."""

    def get_method(self):
        """Return the fixed method name, regardless of data or the `method` argument."""
        return 'HEAD'
2389
2390
class PUTRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always PUT."""

    def get_method(self):
        """Return the fixed method name, regardless of data or the `method` argument."""
        return 'PUT'
2394
2395
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to int, scaled by invscale / scale (floor division).

    If `get_attr` is given, the attribute of that name is converted instead.
    Returns `default` when the value cannot be converted.
    """
    value = v
    if value is not None and get_attr:
        value = getattr(value, get_attr, None)
    try:
        return int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
2403
2404
def str_or_none(v, default=None):
    """Return str(v), or `default` if `v` is None."""
    if v is None:
        return default
    return str(v)
2407
2408
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    ints are returned as is; in strings, the characters , . + are ignored.
    Any other type yields None. """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    digits_only = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits_only)
2416
2417
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to float, scaled by invscale / scale.

    Returns `default` if `v` is None or cannot be converted.
    """
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
2425
2426
def bool_or_none(v, default=None):
    """Return `v` if it is a bool, otherwise `default`."""
    if isinstance(v, bool):
        return v
    return default
2429
2430
def strip_or_none(v, default=None):
    """Return `v` stripped of surrounding whitespace if it is a str, otherwise `default`."""
    if isinstance(v, str):
        return v.strip()
    return default
2433
2434
def url_or_none(url):
    """Return the stripped `url` if it looks like a URL, otherwise None.

    Accepted are protocol-relative URLs and the http(s), rtmp/rtsp family,
    rtmfp, mms and ftp(s) schemes.
    """
    if not url or not isinstance(url, str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2440
2441
def request_to_url(req):
    """Return the full URL of a urllib Request; any other value is returned unchanged."""
    return req.get_full_url() if isinstance(req, urllib.request.Request) else req
2447
2448
def strftime_or_none(timestamp, date_format, default=None):
    """Format a UNIX timestamp (int/float) or a YYYYMMDD string with `date_format`.

    "%s" in the format is expanded to the integer UNIX timestamp.
    Returns `default` for unsupported or unparsable input.
    """
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            moment = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            moment = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            # The attribute access below fails for None and yields the default
            moment = None
        epoch = int(moment.timestamp())
        # Support %s on windows
        expanded_format = re.sub(r'(?<!%)(%%)*%s', rf'\g<1>{epoch}', date_format)
        return moment.strftime(expanded_format)
    except (ValueError, TypeError, AttributeError):
        return default
2463
2464
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Understands clock notation ("1:02:03.5"), unit notation including
    ISO 8601 style ("PT1H2M3S", "3 hours 11 mins") and a single fractional
    amount of hours or minutes ("2.5 hours"). Year, month and week parts are
    accepted but do not contribute to the result.
    Returns None if `s` is not a string, is blank or cannot be parsed.
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    clock = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if clock:
        fields = clock.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        worded = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if worded:
            # Capture groups appear in days, hours, mins, secs, ms order
            fields = worded.groups()
        else:
            single_unit = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not single_unit:
                return None
            fields = (None, *single_unit.groups(), None, None)

    days, hours, mins, secs, ms = fields
    if ms:
        # Clock notation also allows ":" in front of the fraction
        ms = ms.replace(':', '.')
    weighted = ((days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1))
    return sum(float(value or 0) * weight for value, weight in weighted)
2519
2520
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename

    If expected_real_ext is given and does not match the real extension,
    ext is appended to the unmodified filename instead
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
2527
2528
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of filename with ext

    If expected_real_ext is given and does not match the real extension,
    ext is appended to the unmodified filename instead
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        base = name
    else:
        base = filename
    return f'{base}.{ext}'
2534
2535
def check_executable(exe, args=[]):
    """Try to run the given binary and return its name if that works.

    args can be a list of arguments for a short output (like -version).
    Returns False if running the binary raises OSError
    """
    try:
        Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
2544
2545
def _get_exe_version_output(exe, args):
    """Run exe with args and return its combined stdout/stderr text

    Returns None if the process exits with a non-zero status,
    and False if running it raises OSError
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, ret = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if ret else stdout
2558
2559
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable

    Returns group 1 of the first match of version_re in output
    (by default whatever follows the word "version"), or unrecognized
    if nothing matches
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2569
2570
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    unrecognized holds one or two fallback values: the first is returned when
    no version can be found in the output, the last when the executable
    exits with a non-zero status """
    fallbacks = variadic(unrecognized)
    assert len(fallbacks) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return fallbacks[-1]
    if not output:
        # False (could not be run) or empty output are passed through as-is
        return output
    return detect_exe_version(output, version_re, fallbacks[0])
2581
2582
def frange(start=0, stop=None, step=1):
    """Generator similar to range() that also works with float arguments

    With a single argument, it is taken as stop and counting starts from 0.
    A step of 0 yields nothing"""
    if stop is None:
        start, stop = 0, start
    if not step:
        direction = 0
    elif step > 0:
        direction = 1
    else:
        direction = -1
    current = start
    # Multiplying by the direction lets one comparison serve both orientations
    while direction * current < direction * stop:
        yield current
        current += step
2591
2592
class LazyList(collections.abc.Sequence):
    """Immutable sequence that reads its items from an iterable on demand

    Items that were already read are cached so that they can be accessed again.
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = _cache if _cache is not None else []
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # The last item must be known first, so everything has to be read
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        """Read all remaining items into the cache and return the cache itself"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable and return a list in iteration order"""
        items = self._exhaust()
        return items[::-1] if self._reversed else items[::1]

    @staticmethod
    def _reverse_index(x):
        """Map an index of the reversed view to the matching index from the end"""
        if x is None:
            return None
        return ~x

    def __getitem__(self, idx):
        if isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        elif isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        else:
            raise TypeError('indices must be integers or slices')

        counts_from_end = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if counts_from_end:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            # Read only as many items as are needed to reach the highest index
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Fetching the first item of the underlying iterable is enough
        probe = -1 if self._reversed else 0
        try:
            self[probe]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        return len(self._exhaust())

    def __reversed__(self):
        # The new object shares both the iterator and the cache with this one
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2680
2681
class PagedList:
    """Base class for lists whose items are obtained page by page from pagefunc

    Subclasses must implement _getslice(start, end)"""

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return the items of page pagenum as a list, using the cache if possible"""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            if pagenum > self._pagecount:
                # Pages beyond the known page count are not requested
                page_results = []
            else:
                page_results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the items with index in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()
2720
2721
class OnDemandPagedList(PagedList):
    """PagedList that keeps requesting pages until one comes back with
    fewer items than the page size"""

    def _getslice(self, start, end):
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            page_begin = pagenum * size
            page_end = page_begin + size
            if start >= page_end:
                continue

            # Offsets within this page; only the first and the last page are cut
            lower = start % size if page_begin <= start < page_end else 0
            upper = None
            if end is not None and page_begin <= end <= page_end:
                upper = ((end - 1) % size) + 1

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Record the previous page as the last one before re-raising
                self._pagecount = pagenum - 1
                raise
            if lower != 0 or upper is not None:
                page_results = page_results[lower:upper]
            yield from page_results

            # A page that is not "full" is taken to be the last one,
            # so there is no need to query any further pages
            if len(page_results) + lower < size:
                break

            # The page was full, but the requested range ends exactly
            # at its boundary. So the next page is of no interest
            if end == page_end:
                break
2761
2762
class InAdvancePagedList(PagedList):
    """PagedList for which the number of pages is given at construction"""

    def __init__(self, pagefunc, pagecount, pagesize):
        super().__init__(pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        # Number of leading items of the first page that precede start
        to_skip = start - first_page * self._pagesize
        if end is None:
            last_page, remaining = self._pagecount, None
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start
        for pagenum in range(first_page, last_page):
            page_results = self.getpage(pagenum)
            if to_skip:
                page_results = page_results[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page_results) >= remaining:
                    # This page completes the requested range
                    yield from page_results[:remaining]
                    break
                remaining -= len(page_results)
            yield from page_results
2787
2788
class PlaylistEntries:
    """Access to the entries of a playlist info_dict by 1-based index or slice

    The entries may be given as a list, a PagedList, a LazyList or any other
    iterable (which gets wrapped in a LazyList). Indexing the object yields
    (playlist_index, entry) tuples
    """
    # Placeholder for positions of an incomplete playlist that hold no entry
    MissingEntry = object()
    # Set once the end of the entries is known to have been reached
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        object providing params, report_warning, _match_entry
                          and _handle_extraction_exceptions
                          (presumably the YoutubeDL instance - confirm with callers)
        @param info_dict  playlist dict with "entries" and optionally "requested_entries"
        @raises EntryNotInPlaylist  if info_dict has no entries
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Rebuild a list where each given entry sits at its (1-based)
            # requested index and all other positions hold MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # One comma-separated segment of a playlist_items specification:
    # "START", "START:END", "START-END" or "START:END:STEP", where each part
    # is optional and END may also be "inf"/"infinite"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Parse a playlist_items specification

        Yields an int for each single index and a slice for each range.
        @raises ValueError  on empty segments, invalid segments or a zero step
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # end goes through float_or_none, presumably so that "inf" can be represented - confirm
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (playlist_index, entry) for the entries selected by the
        playlist_items/playliststart/playlistend params of ydl"""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    # Stop yielding any further entries
                    return

    def get_full_count(self):
        """Return the total number of entries when it can be determined from
        the current state, else None (implicitly)"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page, the page count is the entry count
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function returning the entry at a 0-based index. It raises
        self.IndexError when the index is past the end of the entries"""
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Fetching the entry may trigger extraction. So the lookup is
                    # wrapped with the _handle_extraction_exceptions of ydl's class
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Generator of (playlist_index, entry) for a 1-based int or slice.
        Unlike python slices, the stop of the slice is inclusive"""
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move stop one position further in the direction of iteration to make it inclusive
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # When counting upwards, all later indices are out of range too
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts by iterating over all the entries
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2923
2924
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape sequence in s with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns a (text, length consumed) pair
        text, _ = decode(match.group(0))
        return text

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2931
2932
def lowercase_escape(s):
    r"""Replace every \uXXXX escape sequence in s with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns a (text, length consumed) pair
        text, _ = decode(match.group(0))
        return text

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2939
2940
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986.

    "%" and the characters listed below are left untouched"""
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe_chars)
2944
2945
def escape_url(url):
    """Escape URL as suggested by RFC 3986.

    The host is IDNA-encoded; the remaining components are percent-escaped"""
    parts = urllib.parse.urlparse(url)
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    )
    return escaped.geturl()
2956
2957
def parse_qs(url, **kwargs):
    """Parse the query string of url into a dict of lists.

    kwargs are passed on to urllib.parse.parse_qs"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2960
2961
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file, one per line

    Blank lines and lines starting with "#", ";" or "]" are skipped.
    A leading BOM, surrounding whitespace and trailing " #comment" are removed.

    @param batch_fd  iterable of str or bytes lines having a close() method;
                     it is closed once all lines were read
    @returns         list of URL strings
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # The first form is the UTF-8 BOM bytes as they appear when
        # the file was read with a single-byte encoding
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: Passing maxsplit positionally is deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2979
2980
def urlencode_postdata(*args, **kargs):
    """Same as urllib.parse.urlencode, but returns ASCII-encoded bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2983
2984
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  mapping merged into the existing query;
                            cannot be combined with the "query" kwarg
       @returns             str
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            # Nothing to change. So the string is returned untouched
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        existing_query = urllib.parse.parse_qs(url.query)
        merged_query = {**existing_query, **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged_query, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
3003
3004
def update_url_query(url, query):
    """Return url with its query string updated with the items of query"""
    updated_url = update_url(url, query_update=query)
    return updated_url
3007
3008
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Create a new request based on req

    @param req      the urllib request to copy from
    @param url      replacement URL (default: the URL of req)
    @param data     replacement body (default: the data of req)
    @param headers  headers added to/overriding those of req
    @param query    query parameters to update in the URL
    @returns        a new request with the same method type as req
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    # HEAD and PUT need a dedicated class to keep their method
    method = req.get_method()
    if method == 'HEAD':
        request_cls = HEADRequest
    elif method == 'PUT':
        request_cls = PUTRequest
    else:
        request_cls = urllib.request.Request

    updated = request_cls(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        updated.timeout = req.timeout
    return updated
3027
3028
def _multipart_encode_impl(data, boundary):
    """Encode data as multipart/form-data using the given boundary

    @returns            (encoded body as bytes, Content-Type header value)
    @raises ValueError  if the boundary occurs within any of the fields
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes

    chunks = []
    for k, v in data.items():
        chunks.append(delimiter + b'\r\n')
        if isinstance(k, str):
            k = k.encode()
        if isinstance(v, str):
            v = v.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks.append(content)

    chunks.append(delimiter + b'--\r\n')

    return b''.join(chunks), content_type
3049
3050
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary and a
        ValueError is raised if it occurs within the data. Otherwise
        random boundaries are generated until one does not occur in the data.

    Returns a tuple of the encoded body and the Content-Type header value

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A clash with the caller's boundary cannot be resolved here
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary occurs in the data. Try another one
            continue
3079
3080
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether x is an instance of allowed_types but not of blocked_types

    By default, str, bytes and mappings are blocked"""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
3085
3086
def variadic(x, allowed_types=NO_DEFAULT):
    """Return x itself if it is iterable-like, else wrap it in a 1-tuple

    Instances of allowed_types are wrapped even when they are iterable"""
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
3092
3093
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each of funcs with args/kwargs and return the first usable result

    A result is usable when the call did not raise one of the common lookup or
    conversion errors and the value is of expected_type (if one is given).
    Returns None if there is no usable result"""
    IGNORED_ERRORS = (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError)
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except IGNORED_ERRORS:
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
3103
3104
def try_get(src, getter, expected_type=None):
    """Apply getter (a function, or several of them) to src and
    return the first usable result as determined by try_call"""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
3107
3108
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of dct for which cndn(key, value) is true

    By default, items whose value is None are dropped"""
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
3111
3112
def merge_dicts(*dicts):
    """Merge dicts, giving precedence to the earlier ones

    None values are ignored, and an empty string that was
    merged earlier can be replaced by a later string"""
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if value is None:
                continue
            if key not in merged or (isinstance(value, str) and merged[key] == ''):
                merged[key] = value
    return merged
3121
3122
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as str, decoding it with encoding if it is not one already"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
3125
3126
# Age limits for film rating labels. parse_age_limit() looks up
# the upper-cased input string here
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
3134
3135
# Age limits for "TV-*" rating labels. parse_age_limit() uses the part of each
# key after "TV-" to build its pattern and hence accepts "TV-14", "TV_14" and "TV14"
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3144
3145
def parse_age_limit(s):
    """Convert an age rating into an age limit

    @param s  an int between 0 and 21, a string like "18" or "18+",
              a key of US_RATINGS, or a TV rating like "TV-14"
    @returns  the age limit as an int, or None if s is not recognized
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    # Match against the suffixes of the known TV ratings ("Y", "Y7", ...)
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
3162
3163
def strip_jsonp(code):
    """Remove the JSONP callback wrapper around the data in code

    Handles "callback(...)", "window.callback(...)" and
    "callback && callback(...)", optionally followed by ";" and // comments.
    code is returned unchanged if it does not look like JSONP"""
    JSONP_RE = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(JSONP_RE, r'\g<callback_data>', code)
3172
3173
def js_to_json(code, vars={}, *, strict=False):
    """Rewrite JavaScript code describing a value into a JSON string

    The conversion is purely regex based: the tokens matched by the
    expression at the end of the function are each rewritten by fix_kv.

    @param code    the JavaScript source text
    @param vars    dict of var, val pairs to substitute
                   (NB: shadows the builtin of the same name within this function)
    @param strict  if true, a ValueError is raised for unknown identifiers instead of
                   quoting them, and the lenient rewrites of "new ...(...)",
                   "parseInt(...)" and immediately invoked functions are skipped
    @returns       the rewritten string
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # A quoted string for each kind of quote, with backslash escapes allowed inside
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Whitespace around an optional comment
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) for integer literals that JSON does not support
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Group 1 is a bare double quote; group 2 is the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Convert the expression inside "${...}" of a template string
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Return the JSON replacement for a single matched token
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, runs of "!" and trailing commas are dropped
            return ''

        if v[0] in STRING_QUOTES:
            # Strings are re-quoted with double quotes; "${...}" is only expanded in backtick strings
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # When used as an object key, the number must become a string
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                # The value is not valid JSON by itself. So encode it as a JSON string
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # "new Map([[k, v], ...])" becomes a JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # "new Date("...")" is replaced by its quoted argument
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        # Any other "new X(...)" is kept as a JSON string of the expression itself
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        # "(function(...){...})("arg")" is replaced by its quoted argument
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
3252
3253
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.

    The returned function maps a quality id to its position in
    quality_ids, and any unknown id to -1 """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
3262
3263
# Names of the stages a postprocessor can be attached to
# NOTE(review): presumably listed in the order in which they occur - confirm
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Output templates per template type
# NOTE(review): presumably used when the user does not provide one - confirm against callers
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types, each mapped to a str or None
# NOTE(review): the values look like the filename suffix of each type - confirm
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# This is a template: {0} is to be replaced with the pattern for the mapping key
# and {1} with the pattern for the conversion type
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3303
3304
def limit_length(s, length):
    """ Shorten s to at most length characters, marking the cut with ellipses.

    None is passed through unchanged """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
3313
3314
def version_tuple(v):
    """Split a version string at "." and "-" into a tuple of ints"""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
3317
3318
def is_outdated_version(version, limit, assume_new=True):
    """Whether version is older than limit

    An empty or unparsable version counts as outdated only if assume_new is false"""
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
3326
3327
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at the top of the module
    from ..update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
3334
3335
def args_to_str(args):
    """ Get a short string representation for a subprocess command

    Every argument is quoted with compat_shlex_quote and the results are
    joined with single spaces
    """
    quoted_args = map(compat_shlex_quote, args)
    return ' '.join(quoted_args)
3339
3340
def error_to_str(err):
    """ Format an exception as "<name of its class>: <its message>" """
    error_name = type(err).__name__
    return f'{error_name}: {err}'
3343
3344
def mimetype2ext(mt, default=NO_DEFAULT):
    """ Guess a file extension from a MIME type

    @param mt       The MIME type, optionally followed by ";"-separated parameters
    @param default  Value to return when the extension is not known
    @returns        The extension from the table below if there is one.
                    Otherwise `default` if it was given; else None when mt is
                    not a string, or the subtype with "+" replaced by "."
    """
    has_default = default is not NO_DEFAULT
    if not isinstance(mt, str):
        return default if has_default else None

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop any parameters (the part after the first ";") and normalize the case
    full_type = mt.split(';', 1)[0].strip().lower()
    subtype = full_type.rpartition('/')[2]
    suffix = subtype.rsplit('+')[-1]

    # The candidate keys are passed from the most to the least specific
    # NOTE(review): this relies on traverse_obj returning the first one that matches - confirm
    known_ext = traversal.traverse_obj(MAP, full_type, subtype, suffix)
    if known_ext:
        return known_ext
    if has_default:
        return default
    return subtype.replace('+', '.')
3433
3434
def ext2mimetype(ext_or_url):
    """ Guess the MIME type for a file extension, filename or URL

    Returns None for empty input or when the type cannot be guessed
    """
    if not ext_or_url:
        return None
    # Anything without a dot is taken to be a bare extension,
    # and is turned into a filename so that its type can be guessed
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _ = mimetypes.guess_type(name)
    return mime_type
3441
3442
def parse_codecs(codecs_str):
    """ Sort the codecs of a comma-separated codecs string into video, audio and subtitle codecs

    @param codecs_str  The codecs string, e.g. "avc1.64001f, mp4a.40.2"
    @returns           {} if the string is empty. If a known codec was found, a dict with
                       "vcodec" and "acodec" ("none" when absent), "dynamic_range", and
                       "scodec" when a subtitle codec was found. If nothing was recognized
                       but there are exactly two codecs, they are taken to be the video
                       and audio codec, in that order. Otherwise {}

    Only the first codec of each kind is used.
    A warning is written for every codec that is not recognized
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros directly in front of a digit are dropped before the codec is identified
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        codec_id = parts[0]
        if codec_id in VIDEO_CODECS:
            if vcodec:
                continue
            vcodec = full_codec
            if codec_id in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec_id == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif codec_id in AUDIO_CODECS:
            if not acodec:
                acodec = full_codec
        elif codec_id in SUBTITLE_CODECS:
            if not scodec:
                scodec = full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if not (vcodec or acodec or scodec):
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        return {}

    codecs_info = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if scodec is not None:
        codecs_info['scodec'] = scodec
    return codecs_info
3483
3484
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """ Choose a container extension for merging the given video and audio streams

    @param vcodecs      Codecs of the video streams
    @param acodecs      Codecs of the audio streams
    @param vexts        Extensions of the video streams; one for each of vcodecs
    @param aexts        Extensions of the audio streams; one for each of acodecs
    @param preferences  Extensions that may be chosen, in order of preference.
                        When empty or None, any extension may be chosen
    @returns            The chosen extension
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and (len(acodecs) > 1 or len(vcodecs) > 1):
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codecs):
        # Only the first codec is considered: the part before its first "." with all zeros removed
        return try_get(codecs, getter=lambda x: x[0].split('.')[0].replace('0', ''))

    vcodec = sanitize_codec(vcodecs)
    acodec = sanitize_codec(acodecs)

    # First choice: a container known to support both codecs
    for ext in preferences or COMPATIBLE_CODECS.keys():
        if ext == 'mkv':
            return ext
        if COMPATIBLE_CODECS.get(ext, set()).issuperset((vcodec, acodec)):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    # Second choice: a container from the same family as the extensions of all the streams
    for ext in preferences or vexts:
        if ext == 'mkv':
            return ext
        current_exts = {ext, *vexts, *aexts}
        if current_exts == {ext}:
            return ext
        if any(ext_set.issuperset(current_exts) for ext_set in COMPATIBLE_EXTS):
            return ext

    if allow_mkv:
        return 'mkv'
    return preferences[-1]
3523
3524
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """ Detect the file extension from the headers of a response

    The following are tried in order:
      1. The quoted filename of an "attachment" Content-Disposition header
      2. The part of the "x-amz-meta-name" header after its last "."
         (the whole value if it has no ".")
      3. The Content-Type header, through mimetype2ext

    @param url_handle  An object whose `headers` attribute has a `get` method
    @param default     Passed on to mimetype2ext
    """
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        ext = mobj and determine_ext(mobj.group('filename'), default_ext=None)
        if ext:
            return ext

    amz_meta_name = headers.get('x-amz-meta-name')
    if amz_meta_name:
        ext = amz_meta_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'), default=default)
3543
3544
def encode_data_uri(data, mime_type):
    """ Build a base64 "data:" URI holding `data` (bytes-like) with the given MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3547
3548
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    @param content_limit  The age limit of the content; None if it is available for everyone
    @param age_limit      The age limit that was set; None if no limit was set
    """
    # Nothing can be blocked unless both limits are known
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3557
3558
# Known byte-order-marks (BOM), each paired with the encoding it indicates.
# The 4 byte marks are listed before the 2 byte marks that they begin with
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Any leading byte-order-marks are removed, and the rest is decoded using the
    encoding of the last mark removed (UTF-8 if there was none).
    Returns a match object if the first non-whitespace character is "<", else None
    """

    detected_encoding = 'utf-8'
    for marker, marker_encoding in BOMS:
        while first_bytes.startswith(marker):
            first_bytes = first_bytes[len(marker):]
            detected_encoding = marker_encoding

    text = first_bytes.decode(detected_encoding, 'replace')
    return re.match(r'^\s*<', text)
3578
3579
def determine_protocol(info_dict):
    """ Determine the protocol that is used for the URL of info_dict

    @param info_dict  Dictionary with a "url", and optionally "protocol" and "is_live"
    @returns          info_dict["protocol"] if that is not None. Otherwise the
                      protocol is guessed from the start of the URL, then from
                      its extension, and finally the URL scheme is used
    """
    explicit_protocol = info_dict.get('protocol')
    if explicit_protocol is not None:
        return explicit_protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        if info_dict.get('is_live'):
            return 'm3u8'
        return 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3600
3601
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned

    @param header_row  The values of the first row
    @param data        The remaining rows
    @param delim       If given, a row made of this string is added below the header
    @param extra_gap   Number of spaces to add between columns, in addition to the usual one
    @param hide_empty  Whether to leave out the columns that are empty in every row of data
    @returns           The table as a single string, with one line for each row
    """
    def visible_len(text):
        # Tabs only mark the alignment and terminal sequences are not displayed
        return len(remove_terminal_sequences(text).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_len(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, mask):
        # Columns beyond the end of the mask are always kept
        return [cell for keep, cell in itertools.zip_longest(mask, row, fillvalue=True) if keep]

    # A column that is empty in all of data has a width of 0, which makes the mask falsy
    mask = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, mask)
    data = [keep_columns(row, mask) for row in data]

    widths = column_widths([header_row] + data)
    gap = extra_gap + 1

    table = [header_row]
    if delim:
        rule = [delim * (column_width + gap) for column_width in widths]
        rule[-1] = rule[-1][:-gap * len(delim)]  # Remove the gap from end of delimiter
        table.append(rule)
    table += data

    lines = []
    for row in table:
        cells = []
        for pos, text in enumerate(map(str, row)):
            padding = widths[pos] - visible_len(text)
            if '\t' in text:
                # The padding goes in place of the tab, pushing the rest to the right
                cells.append(text.replace('\t', ' ' * padding) + ' ' * gap)
            else:
                cells.append(text + ' ' * (padding + gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3632
3633
3634 def _match_one(filter_part, dct, incomplete):
3635     # TODO: Generalize code with YoutubeDL._build_format_filter
3636     STRING_OPERATORS = {
3637         '*=': operator.contains,
3638         '^=': lambda attr, value: attr.startswith(value),
3639         '$=': lambda attr, value: attr.endswith(value),
3640         '~=': lambda attr, value: re.search(value, attr),
3641     }
3642     COMPARISON_OPERATORS = {
3643         **STRING_OPERATORS,
3644         '<=': operator.le,  # "<=" must be defined above "<"
3645         '<': operator.lt,
3646         '>=': operator.ge,
3647         '>': operator.gt,
3648         '=': operator.eq,
3649     }
3650
3651     if isinstance(incomplete, bool):
3652         is_incomplete = lambda _: incomplete
3653     else:
3654         is_incomplete = lambda k: k in incomplete
3655
3656     operator_rex = re.compile(r'''(?x)
3657         (?P<key>[a-z_]+)
3658         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3659         (?:
3660             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3661             (?P<strval>.+?)
3662         )
3663         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3664     m = operator_rex.fullmatch(filter_part.strip())
3665     if m:
3666         m = m.groupdict()
3667         unnegated_op = COMPARISON_OPERATORS[m['op']]
3668         if m['negation']:
3669             op = lambda attr, value: not unnegated_op(attr, value)
3670         else:
3671             op = unnegated_op
3672         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3673         if m['quote']:
3674             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3675         actual_value = dct.get(m['key'])
3676         numeric_comparison = None
3677         if isinstance(actual_value, (int, float)):
3678             # If the original field is a string and matching comparisonvalue is
3679             # a number we should respect the origin of the original field
3680             # and process comparison value as a string (see
3681             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3682             try:
3683                 numeric_comparison = int(comparison_value)
3684             except ValueError:
3685                 numeric_comparison = parse_filesize(comparison_value)
3686                 if numeric_comparison is None:
3687                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3688                 if numeric_comparison is None:
3689                     numeric_comparison = parse_duration(comparison_value)
3690         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3691             raise ValueError('Operator %s only supports string values!' % m['op'])
3692         if actual_value is None:
3693             return is_incomplete(m['key']) or m['none_inclusive']
3694         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3695
3696     UNARY_OPERATORS = {
3697         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3698         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3699     }
3700     operator_rex = re.compile(r'''(?x)
3701         (?P<op>%s)\s*(?P<key>[a-z_]+)
3702         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3703     m = operator_rex.fullmatch(filter_part.strip())
3704     if m:
3705         op = UNARY_OPERATORS[m.group('op')]
3706         actual_value = dct.get(m.group('key'))
3707         if is_incomplete(m.group('key')) and actual_value is None:
3708             return True
3709         return op(actual_value)
3710
3711     raise ValueError('Invalid filter part %r' % filter_part)
3712
3713
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.

    The filter is made of conditions separated by "&", all of which must pass.
    Use "\\&" for a literal "&" inside a condition

    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3724
3725
def match_filter_func(filters, breaking_filters=None):
    """ Create a function that checks a video against the given filters

    @param filters           Filter string(s) in the syntax of match_str. A video is
                             accepted if it passes any one of them. The special
                             filter "-" enables the interactive mode
    @param breaking_filters  Filter string(s). RejectedVideoReached is raised
                             for a video that does not pass these
    @returns                 None if no filters were given at all. Otherwise a
                             function taking (info_dict, incomplete=False), which returns
                             None if the video is accepted, NO_DEFAULT if it is accepted in
                             the interactive mode and incomplete is false, or else a
                             message telling that the video is skipped
    """
    if not (filters or breaking_filters):
        return None
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        rejection = breaking_filters(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if filters and not any(match_str(f, info_dict, incomplete) for f in filters):
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'

        if interactive and not incomplete:
            return NO_DEFAULT
        return None
    return _match_func
3748
3749
class download_range_func:
    """ Callable that yields the sections of a video which are to be downloaded

    @param chapters  Regexes; every chapter whose title matches one of them is yielded
    @param ranges    (start, end) pairs; each is yielded as a section
    """

    def __init__(self, chapters, ranges):
        self.chapters = chapters
        self.ranges = ranges

    def __call__(self, info_dict, ydl):
        """ Yield a dict for each section to download

        When neither chapters nor ranges were given, a single empty dict is yielded.
        Matching chapters are yielded (with their "index" added) before the ranges.
        A message is shown using ydl.to_screen if chapters were requested but none matched
        """
        if not (self.ranges or self.chapters):
            yield {}

        if info_dict.get('chapters'):
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'

        for regex in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(regex, chapter['title']):
                    continue
                warning = None
                yield {**chapter, 'index': index}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        class_name = type(self).__name__
        return f'{__name__}.{class_name}({self.chapters}, {self.ranges})'
3776
3777
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP/TTML time expression into a number of seconds

    Supported are an offset (a number, optionally followed by "s") and a
    clock time ("H:MM:SS", where the seconds may have a fraction that is
    separated by either "." or ":").
    Returns None if the expression is empty or in any other format
    """
    if not time_expr:
        return None

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
3789
3790
def srt_subtitles_timecode(seconds):
    """ Format a time in seconds as the timecode "HH:MM:SS,mmm" that is used in SRT subtitles """
    # NOTE(review): the fields are presumably (hours, minutes, seconds, milliseconds) - confirm
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
3793
3794
def ass_subtitles_timecode(seconds):
    """ Format a time in seconds as the timecode "H:MM:SS.cc" that is used in ASS subtitles """
    timetuple = timetuple_from_msec(seconds * 1000)
    # The last field is replaced by the hundredths of a second.
    # "%d" drops the fractional part of the division
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
3798
3799
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT

    The styling properties listed in SUPPORTED_STYLING are converted to the
    equivalent <font>, <b>, <i> and <u> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError If the document does not contain any paragraph ("p") element
    '''
    # Pairs of (namespace, [legacy namespaces that are to be replaced by it])
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # Maps the id of each style to a dict of its supported properties
    styles = {}
    # The style of the "body" and "div" elements; it is the base style of every element
    default_style = {}

    class TTMLPElementParser:
        # Target of the XMLParser in parse_node(); converts one paragraph into SRT text.
        # NOTE(review): the two lists below are class attributes that are only ever
        # modified in place, so they are shared by all the parsers created in this
        # call and their content carries over from one paragraph to the next.
        # Changing that would alter the output; confirm before touching it
        _out = ''
        # For each element that is currently open, the tags that were opened for it
        _unclosed_elements = []
        # Stack of the styles that are in effect
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                # The style of an element is built from the default style, then the
                # style it references, and finally its own styling attributes
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Do not repeat what is already in effect
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close the tags in the reverse of the order they were opened
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # The node is serialized and parsed again so that the target above gets its events
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # A style can inherit from a style that is defined after it, so the styles
    # are gone through repeatedly until none of them is waiting for its parent
    while True:
        unresolved = False
        num_known_styles = len(styles)
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    unresolved = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            else:
                # Register the style even if it has no supported property,
                # so that the styles inheriting from it can be resolved
                styles.setdefault(style_id, {})
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # If nothing new got registered, the remaining parents are either
        # missing or cyclic. They can never be resolved, so the styles that
        # depend on them are ignored instead of looping forever
        if not unresolved or len(styles) == num_known_styles:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # The paragraphs that get skipped below still use up their index
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        # Without a (non-zero) end time, it is calculated from the duration
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3966
3967
def cli_option(params, command_option, param, separator=None):
    """ Build the command line arguments for an option that takes a value

    @param params          Dictionary to take the value from
    @param command_option  The command line option
    @param param           Key of the value in params
    @param separator       If given, the option and its value are joined with
                           this into a single argument
    @returns               The list of arguments; empty if the value is missing or None
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3973
3974
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Build the command line arguments for an option that takes a boolean value

    The value of the param must be True, False or None. The option is given
    true_value or false_value accordingly, and is left out when it is None.
    See cli_option for the other parameters
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    value_of_flag = {True: true_value, False: false_value}
    return cli_option(value_of_flag, command_option, flag, separator)
3979
3980
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] if the value of the param equals expected_value, else [] """
    if params.get(param) == expected_value:
        return [command_option]
    return []
3983
3984
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """ Look up the list of arguments for the given keys

    @param argdict     Dictionary that maps a key to a list of arguments.
                       A list/tuple is also accepted for backward compatibility
    @param keys        List/tuple of the keys to try, in order. An item may
                       itself hold several keys, whose arguments are then combined
    @param default     Value to return when there is nothing for any of the keys
    @param use_compat  Whether to return argdict as-is when it is a list/tuple.
                       Otherwise such an argdict is ignored
    @returns           The arguments for the first item of keys that has any
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        return argdict if use_compat else default
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = (argdict.get(key.lower()) for key in variadic(key_list))
        found = [args for args in candidates if args is not None]
        if found:
            return list(itertools.chain.from_iterable(found))
    return default
4003
4004
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """ Look up the arguments that are to be passed to `exe` on behalf of `main_key`

    The root key is the lowercased exe if it is the same as the lowercased
    main_key, else "<main_key>+<exe>". Each of `keys` is appended to it to get
    the keys that are looked up with cli_configuration_args.
    If the root key itself is among them, the arguments of main_key and exe
    combined, and then those of "default", are used as the fallback.
    If it is not, use_compat is turned off
    """
    main_key = main_key.lower()
    exe = exe.lower()
    same_key = main_key == exe
    root_key = exe if same_key else f'{main_key}+{exe}'

    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if not same_key:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
4016
4017
class ISO639Utils:
    """ Conversion between the language codes of ISO 639-1 (2 letters) and ISO 639-2/T (3 letters) """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of the code are used.
        Returns None if the code is not known
        """
        short_code = code[:2]
        return cls._lang_map.get(short_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        When several short codes share the long code, the one that is listed
        first is returned. Returns None if the code is not known
        """
        matching_codes = (
            short_name for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matching_codes, None)
4221
4222
class ISO3166Utils:
    """Lookup of country names by their two-letter (ISO 3166-1 alpha-2) code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the corresponding full name

        The lookup is case-insensitive because the code is upper-cased first.
        Returns None when the code is not in the table.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4484
4485
class GeoUtils:
    """Helpers for picking IPv4 addresses out of per-country CIDR blocks"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random dotted-quad IPv4 address from a country or CIDR block

        @param code_or_block  Either a two-character country code (looked up
                              case-insensitively in the table above) or an
                              explicit block in "address/prefix-length" form
        @returns              The address as a string, or None if a
                              two-character code has no known block
        """
        cidr = code_or_block
        if len(code_or_block) == 2:
            # Anything exactly two characters long is treated as a country code
            cidr = cls._country_ip_map.get(code_or_block.upper())
            if not cidr:
                return None

        network, prefix_bits = cidr.split('/')
        lowest, = struct.unpack('!L', socket.inet_aton(network))
        # Setting every host bit gives the last address of the block
        host_mask = 0xffffffff >> int(prefix_bits)
        highest = lowest | host_mask
        picked = random.randint(lowest, highest)
        return str(socket.inet_ntoa(struct.pack('!L', picked)))
4744
4745
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden per request

    A request carrying a 'Ytdl-request-proxy' header uses that value instead
    of the handler-wide proxy; the header is removed before the request
    goes any further.
    """

    def __init__(self, proxies=None):
        def make_default_opener(scheme):
            # Same shape as the openers ProxyHandler installs, but the
            # default proxy is the '__noproxy__' marker
            def opener(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)
            return opener

        # Set default handlers; the base __init__ call below runs afterwards
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme, make_default_opener(scheme))
        super().__init__(proxies)

    def proxy_open(self, req, proxy, type):
        """Resolve the proxy for `req`; returns None when nothing is done here"""
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            proxy = override
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = urllib.parse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None

        return super().proxy_open(req, proxy, type)
4769
4770
4771 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4772 # released into Public Domain
4773 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4774
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : bytes
    Convert a long integer to a big-endian byte string.

    The result has no leading zero bytes, except that zero (and, as before,
    any negative value) is encoded as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # Minimal big-endian encoding: exactly as many bytes as n needs
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'

    if blocksize > 0:
        remainder = len(s) % blocksize
        if remainder:
            s = (blocksize - remainder) * b'\000' + s
    return s
4803
4804
def bytes_to_long(s):
    """bytes_to_long(bytes) : long
    Convert a big-endian byte string to a long integer.

    An empty input yields 0.

    This is (essentially) the inverse of long_to_bytes().
    """
    return int.from_bytes(s, 'big')
4820
4821
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    The bytes of `data` are reversed before being read as one hexadecimal
    number, which is then raised to `exponent` modulo `modulus`.

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    cipher_number = pow(plain_number, exponent, modulus)
    return format(cipher_number, 'x')
4837
4838
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result is laid out as [0, 2] + padding + [0] + data. Every padding
    value is non-zero: the first zero after the leading [0, 2] marks where
    the data starts, so a zero inside the padding would corrupt the message.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves fewer than 8 padding values
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # 3 = the two leading marker values plus the zero separator
    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + padding + [0] + data
4852
4853
def _base_n_table(n, table):
    """Return the symbol table for base `n`

    With no `table`, the first `n` symbols of 0-9a-zA-Z are used. With a
    `table`, it is cut down to its first `n` symbols (or used whole when
    `n` is not given).

    @raises ValueError  if neither argument is given, or if the table has
                        fewer than `n` symbols
    """
    if not table and not n:
        raise ValueError('Either table or n must be specified')
    table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]

    if n and n != len(table):
        raise ValueError(f'base {n} exceeds table length {len(table)}')
    return table


def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string

    @param num    Non-negative integer to encode
    @param n      Base; optional when `table` is given
    @param table  Symbols to use as digits, lowest value first
    @raises ValueError  for a negative number, or for a non-zero number
                        with fewer than two symbols available
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    # Both cases below would never terminate: floor division keeps a
    # negative number at -1, and dividing by 1 never shrinks the number
    if num < 0:
        raise ValueError(f'cannot encode negative number {num}')
    if base < 2:
        raise ValueError(f'cannot encode {num} with a base of {base}')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))


def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int

    A symbol that is not in the table raises KeyError. An empty string
    decodes to 0.
    """
    table = {char: index for index, char in enumerate(_base_n_table(n, table))}
    result, base = 0, len(table)
    for char in string:
        result = result * base + table[char]
    return result
4884
4885
def decode_packed_codes(code):
    """Expand code matched by PACKED_CODES_RE back into readable form

    The match supplies the obfuscated code, a base, a symbol count and a
    '|'-separated symbol list. Each index below the count is encoded in the
    given base and mapped to its symbol; an empty symbol maps the encoded
    index to itself. Every word in the obfuscated code is then replaced
    through that mapping.
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    radix = int(base)
    replacements = symbols.split('|')

    lookup = {}
    for index in reversed(range(int(count))):
        word = encode_base_n(index, radix)
        lookup[word] = replacements[index] or word

    def substitute(word_match):
        return lookup[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, obfuscated_code)
4902
4903
def caesar(s, alphabet, shift):
    """Shift every character of `s` found in `alphabet` by `shift` positions

    The shift wraps around the end of the alphabet; characters that are not
    in the alphabet are copied through unchanged.
    """
    if shift == 0:
        return s

    size = len(alphabet)
    shifted = []
    for char in s:
        if char in alphabet:
            char = alphabet[(alphabet.index(char) + shift) % size]
        shifted.append(char)
    return ''.join(shifted)
4911
4912
def rot47(s):
    """Apply a Caesar shift of 47 over the 94 characters from '!' to '~'"""
    alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, alphabet, 47)
4915
4916
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=value attribute list into a dict

    Values may be bare or double-quoted; the surrounding quotes are removed
    from quoted values. A later occurrence of a key replaces an earlier one.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        name: raw[1:-1] if raw.startswith('"') else raw
        for name, raw in pairs
    }
4924
4925
def urshift(val, n):
    """Unsigned right shift of `val` by `n` bits

    A negative `val` has 2**32 added before shifting.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
4928
4929
def write_xattr(path, key, value):
    """Store `value` (bytes) as the extended attribute `key` of the file at `path`

    Three strategies are tried in order: an alternate data stream when
    compat_os_name is 'nt', then the xattr/pyxattr python module, then the
    setfattr or xattr executable.

    @raises XAttrMetadataError     if writing the attribute fails
    @raises XAttrUnavailableError  if neither a module nor an executable
                                   is available
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as stream:
                stream.write(value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    module_setter = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            module_setter = xattr.set
    elif xattr:
        module_setter = xattr.setxattr

    if module_setter:
        try:
            module_setter(path, key, value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    if check_executable('setfattr', ['--version']):
        exe = 'setfattr'
    elif check_executable('xattr', ['-h']):
        exe = 'xattr'
    else:
        package_hint = (
            '"xattr" binary' if sys.platform != 'linux'
            else 'GNU "attr" package (which contains the "setfattr" tool)')
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + package_hint)

    # The executables take the value as a text argument
    value = value.decode()
    if exe == 'xattr':
        cmd = [exe, '-w', key, value, path]
    else:
        cmd = [exe, '-n', key, '-v', value, path]

    try:
        _, stderr, returncode = Popen.run(
            cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as err:
        raise XAttrMetadataError(err.errno, err.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4979
4980
def random_birthday(year_field, month_field, day_field):
    """Return a random date between 1950-01-01 and 1995-12-31 as form fields

    The result maps the three given field names to the year, month and day
    of the chosen date, each as a string without zero padding.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    fields = (year_field, month_field, day_field)
    values = (birthday.year, birthday.month, birthday.day)
    return {name: str(number) for name, number in zip(fields, values)}
4991
4992
def find_available_port(interface=''):
    """Return a port number that could be bound on `interface`, or None

    A socket is bound to port 0 and the port it ended up on is reported.
    The socket is closed again before returning. Any OSError along the way
    results in None.
    """
    try:
        with socket.socket() as probe:
            probe.bind((interface, 0))
            port = probe.getsockname()[1]
    except OSError:
        return None
    return port
5000
5001
# Templates for internet shortcut files, which are plain text files.
# Placeholders use the %(name)s form: every template has %(url)s, and the
# desktop template additionally has %(filename)s.
# The backslash right after each opening ''' keeps the template from
# starting with an empty line.

# INI-style shortcut with an [InternetShortcut] section
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# Apple property list (XML) shortcut; the \t escapes become real tabs
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# [Desktop Entry] shortcut of Type=Link
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Template lookup by shortcut type name
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5033
5034
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    An explicit port 80 is dropped for the `http` scheme only; for any other scheme it is kept, since it is not that scheme's default. IRIs without a host (e.g. relative references) are supported.

    @raises ValueError  if the network location contains a bracketed (IPv6) host
    """

    iri_parts = urllib.parse.urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `.hostname` is None when the IRI has no network location at all
    if iri_parts.hostname is not None:
        net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
        # The 'idna' encoding produces ASCII text.

    port = iri_parts.port
    # Port 80 is only redundant for plain http; dropping it elsewhere would change the target
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

5077
5078
def to_high_limit_path(path):
    """Return `path` in a form that works around the Windows MAX_PATH limit

    On win32/cygwin the absolute form of `path` is returned behind the
    extended-length prefix; on any other platform `path` is returned unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    extended_length_prefix = '\\\\?\\'
    return extended_length_prefix + os.path.abspath(path)
5085
5086
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Format the value found at `field` in `obj` using `template`

    @param obj       Object handed to `traversal.traverse_obj`
    @param field     Path (or paths) to look up; passed through `variadic`
    @param template  %-style template applied to `func(value)`
    @param ignore    Value(s) for which `default` is returned instead.
                     When not given, every falsy value is ignored
    @param default   Returned when the value is ignored
    @param func      Applied to the value before it is formatted
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        is_ignored = not val
    else:
        is_ignored = val in variadic(ignore)
    if is_ignored:
        return default
    return template % func(val)
5092
5093
def clean_podcast_url(url):
    """Return `url` with every known podcast tracking/redirect prefix removed"""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix_re, '', url)
5109
5110
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version 4 UUID as a lowercase, hyphenated string

    In the template, `x` is replaced by any hex digit, while `y` is restricted
    to 8, 9, a or b so that the two most significant bits of that digit are
    `10`, i.e. the variant that RFC 4122 requires for version 4 UUIDs.
    """
    # Inclusive index ranges into _HEX_TABLE for each placeholder
    digit_ranges = {'x': (0, 15), 'y': (8, 11)}
    return re.sub(
        r'[xy]',
        lambda mobj: _HEX_TABLE[random.randint(*digit_ranges[mobj.group()])],
        'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5116
5117
def make_dir(path, to_screen=None):
    """Create the parent directory of `path` (and any missing ancestors)

    @param path       Path of a file whose containing directory should exist
    @param to_screen  Optional callable that receives a message on failure
    @returns          True if the directory exists afterwards (or `path` has no
                      directory part), False if it could not be created
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Only report when a reporter was actually given; `callable()` returns
        # a bool, so comparing its result against None is always true
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
5128
5129
def get_executable_path():
    """Return the absolute directory of the path reported by `update._get_variant_and_executable_path`"""
    from ..update import _get_variant_and_executable_path

    variant_and_path = _get_variant_and_executable_path()
    executable = os.path.abspath(variant_and_path[1])
    return os.path.dirname(executable)
5134
5135
def get_user_config_dirs(package_name):
    """Yield the per-user directories in which a config of `package_name` may live

    Order: the XDG config home, the `appdata` directory (only if that environment
    variable is set) and finally a dot-directory in the user's home.
    """
    # .config (e.g. ~/.config/package_name)
    config_home = os.getenv('XDG_CONFIG_HOME')
    if not config_home:
        config_home = compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name)
    roaming_dir = os.getenv('appdata')
    if roaming_dir:
        yield os.path.join(roaming_dir, package_name)

    # home (~/.package_name)
    home_dir = compat_expanduser('~')
    yield os.path.join(home_dir, f'.{package_name}')
5148
5149
def get_system_config_dirs(package_name):
    """Yield the system-wide directories in which a config of `package_name` may live"""
    # /etc/package_name
    system_root = '/etc'
    yield os.path.join(system_root, package_name)
5153
5154
def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)

    The keyword arguments are passed to `datetime.timedelta` and the resulting
    offset (in seconds) is added to the current time.
    """
    now = time.time()
    offset = datetime.timedelta(**kwargs)
    return now + offset.total_seconds()
5160
5161
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Build a `header.payload.signature` token signed with HMAC-SHA256

    @param payload_data  JSON-serializable payload
    @param key           Signing key (str)
    @param headers       Extra header fields, merged over `alg` and `typ`
    @returns             The token as bytes

    NB: The segments use standard base64 (`base64.b64encode`), i.e. the padding
    is kept and the alphabet is not the URL-safe one.
    """
    def _b64_json(obj):
        return base64.b64encode(json.dumps(obj).encode())

    header_data = dict(alg='HS256', typ='JWT')
    if headers:
        header_data.update(headers)

    signing_input = b'.'.join((_b64_json(header_data), _b64_json(payload_data)))
    digest = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(digest)
5179
5180
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of a three-segment token; the signature is not verified"""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded_payload = f'{payload_b64}==='
    raw_payload = base64.urlsafe_b64decode(padded_payload)
    return json.loads(raw_payload)
5187
5188
# False on Windows (until `windows_enable_vt_mode` sets it to True), None on other systems
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Whether terminal (escape) sequences may be written to `stream`

    Requires VT mode on Windows and a non-empty TERM variable elsewhere; the
    stream must additionally report itself as a tty. The result is cached per stream.
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
5203
5204
def windows_enable_vt_mode():
    """Enable virtual terminal processing for the Windows console output

    Does nothing when `get_windows_version()` reports a version older than
    (10, 0, 10586). On success, `WINDOWS_VT_MODE` is set to True and the cache
    of `supports_terminal_sequences` is cleared.

    @raises Exception  if GetConsoleMode or SetConsoleMode reports failure

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported here since they are only needed (and `msvcrt` only available) on Windows
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Keep the mode bits that are already set and add the VT processing flag
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        # The descriptor is closed whether or not the mode could be changed
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Results cached while VT mode was off are stale now
    supports_terminal_sequences.cache_clear()
5235
5236
# ESC, "[", one or more characters other than "m", then "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` without the sequences matched by `_terminal_sequences_re`"""
    return re.sub(_terminal_sequences_re, '', string)
5242
5243
def number_of_digits(number):
    """Return the length of `number` rendered with `%d` (a minus sign is counted too)"""
    rendered = '%d' % number
    return len(rendered)
5246
5247
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy `values` with `delim`

    @param from_dict  If given, each value is treated as a path and is first
                      looked up in `from_dict` using `traversal.traverse_obj`
    """
    if from_dict is not None:
        values = map(lambda path: traversal.traverse_obj(from_dict, variadic(path)), values)
    return delim.join(str(value) for value in values if value)
5252
5253
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand

    If no format has a (non-zero) width, `thumbnails` is returned as is.
    """
    dimension_keys = ('width', 'height')
    # Tuples compare element-wise, so the width decides and the height breaks ties
    candidates = (tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats)
    max_width, max_height = max(candidates, default=(0, 0))
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': scaled_url},
            {'width': max_width, 'height': max_height}, thumbnail))
    return scaled
5274
5275
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple.

    @returns (start, end, total); all three are None when `range` is empty or
             does not contain a byte range, and end/total may be None individually
    """
    unparsed = (None, None, None)
    if not range:
        return unparsed
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if mobj is None:
        return unparsed
    start, end, total = mobj.groups()
    return int(start), int_or_none(end), int_or_none(total)
5284
5285
def read_stdin(what):
    """Announce (via `write_string`) that `what` is read from STDIN and return `sys.stdin`"""
    eof = 'Ctrl+D'
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5290
5291
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)

    `encoding` is None when neither a BOM nor a `# coding: ...` declaration
    is found at the very start of `data`.
    """

    # BOM marks are given priority over declarations
    bom_result = next(((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_result is not None:
        return bom_result

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    without_nulls = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', without_nulls)
    if not declaration:
        return None, 0
    return declaration.group(1).decode(), 0
5308
5309
class Config:
    """A set of command-line style arguments together with the configs they pull in

    Every instance keeps its own arguments in `own_args` and, in `configs`, one
    child `Config` for each location named by the `config_locations` option of
    those arguments.
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        """
        @param parser  Parser used for `parse_known_args`, `parse_args` and `error`
        @param label   Optional prefix used when describing this config in `__str__`
        """
        self.parser, self.label = parser, label
        # `_loaded_paths` is shared with all child configs (see `append_config`)
        # so that the same location is never loaded twice
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Set the arguments (and the file they came from) and load the configs they reference

        @returns  False if `filename` was already loaded, True otherwise
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse `own_args` and append a child config for each of its `config_locations`

        Relative locations are resolved against the directory of `filename`.
        @returns  False if `filename` was already loaded, True otherwise
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # STDIN can be consumed only once
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read the arguments stored in the file `filename`

        @param default  Returned as is when the file cannot be opened.
                        NB: The default list is shared between calls and must not be mutated
        @returns        The arguments as a list (shlex-split; comments are dropped)
        @raises ValueError  if the contents cannot be decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present

        # The context manager closes the file on every path, including
        # unexpected errors raised while the encoding is being detected
        with optionf:
            try:
                enc, skip = determine_file_encoding(optionf.read(512))
                optionf.seek(skip, io.SEEK_SET)
            except OSError:
                enc = None  # silently skip read errors
            try:
                # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                contents = optionf.read().decode(enc or preferredencoding())
                return shlex.split(contents, comments=True)
            except Exception as err:
                raise ValueError(f'Unable to parse "{filename}": {err}') from err

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of the argument list `opts` with all credentials replaced by "PRIVATE"

        Both the `--option=value` and the `--option value` forms are handled.
        """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            # The value of a private option is the argument that follows it
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config from `args` (see `init`) unless its file was loaded before"""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Arguments of all child configs (last appended first), followed by the own arguments"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
5417
5418
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes

    The connection runs on a private event loop that is driven synchronously
    by `send`, `recv` and the context manager methods. `__exit__` is also
    registered with `atexit`, so it may get called more than once.
    """
    pool = None

    def __init__(self, url, headers=None, connect=True):
        """
        @param url      URL handed to `websockets.connect`
        @param headers  Extra headers for the handshake
        @param connect  Whether to open the connection right away
        """
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        # Already shut down, e.g. when the atexit hook runs after a `with` block
        if self.loop.is_closed():
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Leftover tasks can only be cancelled while the loop is still open,
            # so the loop must be closed last
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` on `loop` until it completes and return its result

        @raises ValueError  if `main` is not a coroutine
        """
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel every unfinished task of `loop` and wait until all of them are done"""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # The tasks are already bound to `loop`. The "loop" parameter of
        # `asyncio.gather` must not be passed since it was removed in python 3.10
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5488
5489
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones

    The keys of the result are title-cased (e.g. `Content-Type`).
    """
    merged = {}
    for headers in dicts:
        # NB: The plain `dict.items` is used, bypassing any override in a subclass
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
5493
5494
def cached_method(f):
    """Cache a method

    The results are stored on the instance (in `_cached_method__cache`), keyed
    by the method name and by the fully bound arguments, so that positional,
    keyword and defaulted forms of the same call share one entry.
    """
    method_signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        call = method_signature.bind(self, *args, **kwargs)
        call.apply_defaults()
        # The first bound argument is the instance itself; it is not part of the key
        key = tuple(call.arguments.values())[1:]

        per_instance = vars(self).setdefault('_cached_method__cache', {})
        results = per_instance.setdefault(f.__name__, {})
        if key in results:
            return results[key]
        results[key] = value = f(self, *args, **kwargs)
        return value
    return wrapper
5510
5511
class classproperty:
    """property access for class methods with optional caching

    Usable both as `@classproperty` and as `@classproperty(cache=True)`.
    When caching, the value is computed once per class.
    """
    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Called with options only: return a decorator that awaits the function
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cached = self._cache
        if cached is None:
            return self.func(cls)
        if cls not in cached:
            cached[cls] = self.func(cls)
        return cached[cls]
5530
5531
class function_with_repr:
    """Callable wrapper around `func` with a customizable `repr()`

    Without `repr_` (or when it is empty), the repr is `<module>.<qualname>` of `func`.
    """

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        custom_repr = self.__repr
        if not custom_repr:
            func = self.func
            return f'{func.__module__}.{func.__qualname__}'
        return custom_repr
5544
5545
class Namespace(types.SimpleNamespace):
    """Immutable namespace

    Iterating over the namespace yields the attribute values; use `items_`
    for the (name, value) pairs.
    NB: The immutability is a convention; it is not enforced by this class.
    """

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        return vars(self).items()
5555
5556
# File extensions grouped by the kind of media they are used for
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# `video` and `audio` are rebound so that they also include the `common_*` extensions (appended at the end)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in that order.
# Thumbnail, storyboard and subtitle extensions are not included
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5571
5572
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue

    Iteration stops once an attempt finishes without setting `retry.error`
    or when more than `retries` retries would be needed. After each failed
    attempt, the error callback is called with (error, attempt, retries).
    """
    attempt = 0
    _error = None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # NO_DEFAULT marks "no error was set during the current attempt"
        return None if self._error is NO_DEFAULT else self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            failure = self.error
            if failure:
                self.error_callback(failure, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        @param e           The error of the failed attempt
        @param count       Number of the attempt that failed
        @param retries     Maximum number of retries
        @param sleep_func  Delay in seconds before the next attempt, or a
                           callable that returns it when called with `n=count - 1`
        @param info        Callable used to report the delay
        @param warn        Callable used to report that a retry follows
        @param error       Callable used to report that no retries are left;
                           if not given, `e` is raised instead
        @param suffix      Text inserted after "Retrying" in the warning
        """
        if count > retries:
            if not error:
                raise e
            if count > 1:
                return error(f'{e}. Giving up after {count - 1} retries')
            return error(str(e))

        if not count:
            return warn(e)
        if isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        if callable(sleep_func):
            delay = float_or_none(sleep_func(n=count - 1))
        else:
            delay = sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5627
5628
def make_archive_id(ie, video_id):
    """Return the archive id `<lowercased ie key> <video_id>`

    @param ie  Either the key itself (str) or an object providing `ie_key()`
    """
    if not isinstance(ie, str):
        ie = ie.ie_key()
    return f'{ie.lower()} {video_id}'
5632
5633
def truncate_string(s, left, right=0):
    """Shorten `s` to `left` leading characters (the last 3 being "...") plus `right` trailing ones

    `s` is returned as is when it is None or no longer than `left + right`.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5639
5640
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Resolve a list of (possibly negated) option values into an ordered set of items

    @param options     Values to add, in order; a leading "-" removes instead
    @param alias_dict  Maps each alias to the list of values it expands to.
                       The "all" alias is required and lists every valid item
    @param use_regex   Treat non-alias values as case-insensitive regexes that
                       must fully match an item of the "all" alias
    @param start       Items that are selected initially
    @raises ValueError if a value is neither an alias nor a valid item (when not using regex)
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for option in options:
        discard = option.startswith('-')
        name = option[1:] if discard else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if discard:
                # Discarding an alias inverts each of its members
                expansion = [
                    member[1:] if member.startswith('-') else f'-{member}'
                    for member in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            current = [name]
        else:
            raise ValueError(name)

        if not discard:
            requested.extend(current)
            continue
        for item in current:
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5669
5670
5671 class FormatSorter:
5672     regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5673
5674     default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5675                'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5676                'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
5677     ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5678                     'height', 'width', 'proto', 'vext', 'abr', 'aext',
5679                     'fps', 'fs_approx', 'source', 'id')
5680
5681     settings = {
5682         'vcodec': {'type': 'ordered', 'regex': True,
5683                    'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5684         'acodec': {'type': 'ordered', 'regex': True,
5685                    'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
5686         'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5687                 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5688         'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5689                   'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5690         'vext': {'type': 'ordered', 'field': 'video_ext',
5691                  'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5692                  'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
5693         'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5694                  'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5695                  'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
5696         'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5697         'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5698                        'field': ('vcodec', 'acodec'),
5699                        'function': lambda it: int(any(v != 'none' for v in it))},
5700         'ie_pref': {'priority': True, 'type': 'extractor'},
5701         'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5702         'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5703         'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5704         'quality': {'convert': 'float', 'default': -1},
5705         'filesize': {'convert': 'bytes'},
5706         'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5707         'id': {'convert': 'string', 'field': 'format_id'},
5708         'height': {'convert': 'float_none'},
5709         'width': {'convert': 'float_none'},
5710         'fps': {'convert': 'float_none'},
5711         'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5712         'tbr': {'convert': 'float_none'},
5713         'vbr': {'convert': 'float_none'},
5714         'abr': {'convert': 'float_none'},
5715         'asr': {'convert': 'float_none'},
5716         'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5717
5718         'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5719         'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
5720         'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
5721         'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5722         'res': {'type': 'multiple', 'field': ('height', 'width'),
5723                 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5724
5725         # Actual field names
5726         'format_id': {'type': 'alias', 'field': 'id'},
5727         'preference': {'type': 'alias', 'field': 'ie_pref'},
5728         'language_preference': {'type': 'alias', 'field': 'lang'},
5729         'source_preference': {'type': 'alias', 'field': 'source'},
5730         'protocol': {'type': 'alias', 'field': 'proto'},
5731         'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5732         'audio_channels': {'type': 'alias', 'field': 'channels'},
5733
5734         # Deprecated
5735         'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5736         'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5737         'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5738         'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5739         'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5740         'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5741         'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5742         'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5743         'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5744         'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5745         'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5746         'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5747         'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5748         'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5749         'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5750         'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5751         'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5752         'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5753         'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5754         'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5755     }
5756
5757     def __init__(self, ydl, field_preference):
5758         self.ydl = ydl
5759         self._order = []
5760         self.evaluate_params(self.ydl.params, field_preference)
5761         if ydl.params.get('verbose'):
5762             self.print_verbose_info(self.ydl.write_debug)
5763
    def _get_field_setting(self, field, key):
        """Return the setting `key` of the sort field `field`

        A setting that is not defined gets a default, which is also stored in
        `self.settings` so that later lookups return the same value.
        NOTE(review): `settings` is defined on the class, so unless an instance
        attribute shadows it, these writes are shared by all instances.
        """
        if field not in self.settings:
            # Unknown fields are never forced/prioritized; asking for these keys
            # neither warns nor registers the field
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                # 'extractor' fields read "preference"; 'combined'/'multiple' fields
                # default to a 1-tuple of the field name; others to the name itself
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                # Defaults of the remaining keys; keys not listed here default to None
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default
        return propObj[key]
5782
    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert `value` according to the 'convert' setting of `field`

        @param value        String to convert (it is lowercased first) or None
        @param convertNone  If false, a `value` of None is returned as None
                            without any conversion
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            # The result is `list_length - index`, so entries earlier in the order list get higher values
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # Values that are not in the list get the position of the '' entry.
            # Without such an entry, they end up below every listed value
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    # Falsy entries ('' and None) are not patterns and are skipped
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or  value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            # Any other conversion: numeric strings become floats. Otherwise the
            # field is switched to 'string' conversion from now on
            if value.isnumeric():
                return float(value)
            else:
                self.settings[field]['convert'] = 'string'
                return value
5816
5817     def evaluate_params(self, params, sort_extractor):
5818         self._use_free_order = params.get('prefer_free_formats', False)
5819         self._sort_user = params.get('format_sort', [])
5820         self._sort_extractor = sort_extractor
5821
5822         def add_item(field, reverse, closest, limit_text):
5823             field = field.lower()
5824             if field in self._order:
5825                 return
5826             self._order.append(field)
5827             limit = self._resolve_field_value(field, limit_text)
5828             data = {
5829                 'reverse': reverse,
5830                 'closest': False if limit is None else closest,
5831                 'limit_text': limit_text,
5832                 'limit': limit}
5833             if field in self.settings:
5834                 self.settings[field].update(data)
5835             else:
5836                 self.settings[field] = data
5837
5838         sort_list = (
5839             tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5840             + (tuple() if params.get('format_sort_force', False)
5841                 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5842             + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5843
5844         for item in sort_list:
5845             match = re.match(self.regex, item)
5846             if match is None:
5847                 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5848             field = match.group('field')
5849             if field is None:
5850                 continue
5851             if self._get_field_setting(field, 'type') == 'alias':
5852                 alias, field = field, self._get_field_setting(field, 'field')
5853                 if self._get_field_setting(alias, 'deprecated'):
5854                     self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5855                                                 f'be removed in a future version. Please use {field} instead')
5856             reverse = match.group('reverse') is not None
5857             closest = match.group('separator') == '~'
5858             limit_text = match.group('limit')
5859
5860             has_limit = limit_text is not None
5861             has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5862             has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5863
5864             fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5865             limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5866             limit_count = len(limits)
5867             for (i, f) in enumerate(fields):
5868                 add_item(f, reverse, closest,
5869                          limits[i] if i < limit_count
5870                          else limits[0] if has_limit and not has_multiple_limits
5871                          else None)
5872
5873     def print_verbose_info(self, write_debug):
5874         if self._sort_user:
5875             write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5876         if self._sort_extractor:
5877             write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5878         write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5879             '+' if self._get_field_setting(field, 'reverse') else '', field,
5880             '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5881                           self._get_field_setting(field, 'limit_text'),
5882                           self._get_field_setting(field, 'limit'))
5883             if self._get_field_setting(field, 'limit_text') is not None else '')
5884             for field in self._order if self._get_field_setting(field, 'visible')]))
5885
5886     def _calculate_field_preference_from_value(self, format, field, type, value):
5887         reverse = self._get_field_setting(field, 'reverse')
5888         closest = self._get_field_setting(field, 'closest')
5889         limit = self._get_field_setting(field, 'limit')
5890
5891         if type == 'extractor':
5892             maximum = self._get_field_setting(field, 'max')
5893             if value is None or (maximum is not None and value >= maximum):
5894                 value = -1
5895         elif type == 'boolean':
5896             in_list = self._get_field_setting(field, 'in_list')
5897             not_in_list = self._get_field_setting(field, 'not_in_list')
5898             value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5899         elif type == 'ordered':
5900             value = self._resolve_field_value(field, value, True)
5901
5902         # try to convert to number
5903         val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5904         is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5905         if is_num:
5906             value = val_num
5907
5908         return ((-10, 0) if value is None
5909                 else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
5910                 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5911                 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5912                 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5913                 else (-1, value, 0))
5914
5915     def _calculate_field_preference(self, format, field):
5916         type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
5917         get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5918         if type == 'multiple':
5919             type = 'field'  # Only 'field' is allowed in multiple for now
5920             actual_fields = self._get_field_setting(field, 'field')
5921
5922             value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5923         else:
5924             value = get_value(field)
5925         return self._calculate_field_preference_from_value(format, field, type, value)
5926
5927     def calculate_preference(self, format):
5928         # Determine missing protocol
5929         if not format.get('protocol'):
5930             format['protocol'] = determine_protocol(format)
5931
5932         # Determine missing ext
5933         if not format.get('ext') and 'url' in format:
5934             format['ext'] = determine_ext(format['url'])
5935         if format.get('vcodec') == 'none':
5936             format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5937             format['video_ext'] = 'none'
5938         else:
5939             format['video_ext'] = format['ext']
5940             format['audio_ext'] = 'none'
5941         # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
5942         #    format['preference'] = -1000
5943
5944         if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5945             # HEVC-over-FLV is out-of-spec by FLV's original spec
5946             # ref. https://trac.ffmpeg.org/ticket/6389
5947             # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5948             format['preference'] = -100
5949
5950         # Determine missing bitrates
5951         if format.get('tbr') is None:
5952             if format.get('vbr') is not None and format.get('abr') is not None:
5953                 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
5954         else:
5955             if format.get('vcodec') != 'none' and format.get('vbr') is None:
5956                 format['vbr'] = format.get('tbr') - format.get('abr', 0)
5957             if format.get('acodec') != 'none' and format.get('abr') is None:
5958                 format['abr'] = format.get('tbr') - format.get('vbr', 0)
5959
5960         return tuple(self._calculate_field_preference(format, field) for field in self._order)