yt_dlp/utils/_utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import collections.abc
   9 import contextlib
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import gzip
  15 import hashlib
  16 import hmac
  17 import html.entities
  18 import html.parser
  19 import http.client
  20 import http.cookiejar
  21 import inspect
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import mimetypes
  28 import operator
  29 import os
  30 import platform
  31 import random
  32 import re
  33 import shlex
  34 import socket
  35 import ssl
  36 import struct
  37 import subprocess
  38 import sys
  39 import tempfile
  40 import time
  41 import traceback
  42 import types
  43 import unicodedata
  44 import urllib.error
  45 import urllib.parse
  46 import urllib.request
  47 import xml.etree.ElementTree
  48 import zlib
  49
  50 from . import traversal
  51
  52 from ..compat import functools  # isort: split
  53 from ..compat import (
  54     compat_etree_fromstring,
  55     compat_expanduser,
  56     compat_HTMLParseError,
  57     compat_os_name,
  58     compat_shlex_quote,
  59 )
  60 from ..dependencies import brotli, certifi, websockets, xattr
  61 from ..socks import ProxyType, sockssocket
  62
# Drop the last dotted component of this module's name so that the module
# reports its parent package's name instead of its own
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
# (taken from the type of a freshly compiled empty pattern)
compiled_regex_type = type(re.compile(''))
  67
  68
  69 def random_user_agent():
  70     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  71     _CHROME_VERSIONS = (
  72         '90.0.4430.212',
  73         '90.0.4430.24',
  74         '90.0.4430.70',
  75         '90.0.4430.72',
  76         '90.0.4430.85',
  77         '90.0.4430.93',
  78         '91.0.4472.101',
  79         '91.0.4472.106',
  80         '91.0.4472.114',
  81         '91.0.4472.124',
  82         '91.0.4472.164',
  83         '91.0.4472.19',
  84         '91.0.4472.77',
  85         '92.0.4515.107',
  86         '92.0.4515.115',
  87         '92.0.4515.131',
  88         '92.0.4515.159',
  89         '92.0.4515.43',
  90         '93.0.4556.0',
  91         '93.0.4577.15',
  92         '93.0.4577.63',
  93         '93.0.4577.82',
  94         '94.0.4606.41',
  95         '94.0.4606.54',
  96         '94.0.4606.61',
  97         '94.0.4606.71',
  98         '94.0.4606.81',
  99         '94.0.4606.85',
 100         '95.0.4638.17',
 101         '95.0.4638.50',
 102         '95.0.4638.54',
 103         '95.0.4638.69',
 104         '95.0.4638.74',
 105         '96.0.4664.18',
 106         '96.0.4664.45',
 107         '96.0.4664.55',
 108         '96.0.4664.93',
 109         '97.0.4692.20',
 110     )
 111     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 112
 113
# Content encodings listed as supported; 'br' is only added when the
# optional brotli dependency is truthy (i.e. could be imported)
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Standard header values. Note that random_user_agent() is evaluated once,
# when this module is executed, so the User-Agent stays fixed afterwards
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


# Fixed User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
 131
 132
 133 class NO_DEFAULT:
 134     pass
 135
 136
 137 def IDENTITY(x):
 138     return x
 139
 140
# Full English month names, in calendar order (January first)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code; every list is in calendar order
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
 156
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Maps a timezone abbreviation to its offset from UTC, expressed in whole hours
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
 166
# needed for sanitizing filenames in restricted mode
# Maps each accented character of the first string to its ASCII replacement.
# The replacements are zipped in the same order; multi-letter replacements
# ('AE', 'OE', 'TH', 'ss', ...) are passed as one-item lists so that they stay
# a single zip element instead of being split into characters
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 171
 172 DATE_FORMATS = (
 173     '%d %B %Y',
 174     '%d %b %Y',
 175     '%B %d %Y',
 176     '%B %dst %Y',
 177     '%B %dnd %Y',
 178     '%B %drd %Y',
 179     '%B %dth %Y',
 180     '%b %d %Y',
 181     '%b %dst %Y',
 182     '%b %dnd %Y',
 183     '%b %drd %Y',
 184     '%b %dth %Y',
 185     '%b %dst %Y %I:%M',
 186     '%b %dnd %Y %I:%M',
 187     '%b %drd %Y %I:%M',
 188     '%b %dth %Y %I:%M',
 189     '%Y %m %d',
 190     '%Y-%m-%d',
 191     '%Y.%m.%d.',
 192     '%Y/%m/%d',
 193     '%Y/%m/%d %H:%M',
 194     '%Y/%m/%d %H:%M:%S',
 195     '%Y%m%d%H%M',
 196     '%Y%m%d%H%M%S',
 197     '%Y%m%d',
 198     '%Y-%m-%d %H:%M',
 199     '%Y-%m-%d %H:%M:%S',
 200     '%Y-%m-%d %H:%M:%S.%f',
 201     '%Y-%m-%d %H:%M:%S:%f',
 202     '%d.%m.%Y %H:%M',
 203     '%d.%m.%Y %H.%M',
 204     '%Y-%m-%dT%H:%M:%SZ',
 205     '%Y-%m-%dT%H:%M:%S.%fZ',
 206     '%Y-%m-%dT%H:%M:%S.%f0Z',
 207     '%Y-%m-%dT%H:%M:%S',
 208     '%Y-%m-%dT%H:%M:%S.%f',
 209     '%Y-%m-%dT%H:%M',
 210     '%b %d %Y at %H:%M',
 211     '%b %d %Y at %H:%M:%S',
 212     '%B %d %Y at %H:%M',
 213     '%B %d %Y at %H:%M:%S',
 214     '%H:%M %d-%b-%Y',
 215 )
 216
 217 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 218 DATE_FORMATS_DAY_FIRST.extend([
 219     '%d-%m-%Y',
 220     '%d.%m.%Y',
 221     '%d.%m.%y',
 222     '%d/%m/%Y',
 223     '%d/%m/%y',
 224     '%d/%m/%Y %H:%M:%S',
 225     '%d-%m-%Y %H:%M',
 226     '%H:%M %d/%m/%Y',
 227 ])
 228
 229 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 230 DATE_FORMATS_MONTH_FIRST.extend([
 231     '%m-%d-%Y',
 232     '%m.%d.%Y',
 233     '%m/%d/%Y',
 234     '%m/%d/%y',
 235     '%m/%d/%Y %H:%M:%S',
 236 ])
 237
# Matches `}('<payload>',<digits>,<digits>,'<words>'.split('|')` and captures,
# in order: the quoted payload, the two integers and the '|'-joined word list
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type is application/ld+json (optionally quoted)
# and captures its body, a JSON object or array, as the named group `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Unsigned integer or decimal number (no sign, no exponent)
NUMBER_RE = r'\d+(?:\.\d+)?'
 242
 243
 244 @functools.cache
 245 def preferredencoding():
 246     """Get preferred encoding.
 247
 248     Returns the best encoding scheme for the system, based on
 249     locale.getpreferredencoding() and some further tweaks.
 250     """
 251     try:
 252         pref = locale.getpreferredencoding()
 253         'TEST'.encode(pref)
 254     except Exception:
 255         pref = 'UTF-8'
 256
 257     return pref
 258
 259
 260 def write_json_file(obj, fn):
 261     """ Encode obj as JSON and write it to fn, atomically if possible """
 262
 263     tf = tempfile.NamedTemporaryFile(
 264         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 265         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 266
 267     try:
 268         with tf:
 269             json.dump(obj, tf, ensure_ascii=False)
 270         if sys.platform == 'win32':
 271             # Need to remove existing file on Windows, else os.rename raises
 272             # WindowsError or FileExistsError.
 273             with contextlib.suppress(OSError):
 274                 os.unlink(fn)
 275         with contextlib.suppress(OSError):
 276             mask = os.umask(0)
 277             os.umask(mask)
 278             os.chmod(tf.name, 0o666 & ~mask)
 279         os.rename(tf.name, fn)
 280     except Exception:
 281         with contextlib.suppress(OSError):
 282             os.remove(tf.name)
 283         raise
 284
 285
 286 def find_xpath_attr(node, xpath, key, val=None):
 287     """ Find the xpath xpath[@key=val] """
 288     assert re.match(r'^[a-zA-Z_-]+$', key)
 289     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 290     return node.find(expr)
 291
# On Python 2.6 the xml.etree.ElementTree.Element methods do not support
# the namespaces parameter, hence the manual prefix expansion in xpath_with_ns()
 294
 295
 296 def xpath_with_ns(path, ns_map):
 297     components = [c.split(':') for c in path.split('/')]
 298     replaced = []
 299     for c in components:
 300         if len(c) == 1:
 301             replaced.append(c[0])
 302         else:
 303             ns, tag = c
 304             replaced.append('{%s}%s' % (ns_map[ns], tag))
 305     return '/'.join(replaced)
 306
 307
 308 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 309     def _find_xpath(xpath):
 310         return node.find(xpath)
 311
 312     if isinstance(xpath, str):
 313         n = _find_xpath(xpath)
 314     else:
 315         for xp in xpath:
 316             n = _find_xpath(xp)
 317             if n is not None:
 318                 break
 319
 320     if n is None:
 321         if default is not NO_DEFAULT:
 322             return default
 323         elif fatal:
 324             name = xpath if name is None else name
 325             raise ExtractorError('Could not find XML element %s' % name)
 326         else:
 327             return None
 328     return n
 329
 330
 331 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 332     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 333     if n is None or n == default:
 334         return n
 335     if n.text is None:
 336         if default is not NO_DEFAULT:
 337             return default
 338         elif fatal:
 339             name = xpath if name is None else name
 340             raise ExtractorError('Could not find XML element\'s text %s' % name)
 341         else:
 342             return None
 343     return n.text
 344
 345
 346 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 347     n = find_xpath_attr(node, xpath, key)
 348     if n is None:
 349         if default is not NO_DEFAULT:
 350             return default
 351         elif fatal:
 352             name = f'{xpath}[@{key}]' if name is None else name
 353             raise ExtractorError('Could not find XML attribute %s' % name)
 354         else:
 355             return None
 356     return n.attrib[key]
 357
 358
 359 def get_element_by_id(id, html, **kwargs):
 360     """Return the content of the tag with the specified ID in the passed HTML document"""
 361     return get_element_by_attribute('id', id, html, **kwargs)
 362
 363
 364 def get_element_html_by_id(id, html, **kwargs):
 365     """Return the html of the tag with the specified ID in the passed HTML document"""
 366     return get_element_html_by_attribute('id', id, html, **kwargs)
 367
 368
 369 def get_element_by_class(class_name, html):
 370     """Return the content of the first tag with the specified class in the passed HTML document"""
 371     retval = get_elements_by_class(class_name, html)
 372     return retval[0] if retval else None
 373
 374
 375 def get_element_html_by_class(class_name, html):
 376     """Return the html of the first tag with the specified class in the passed HTML document"""
 377     retval = get_elements_html_by_class(class_name, html)
 378     return retval[0] if retval else None
 379
 380
 381 def get_element_by_attribute(attribute, value, html, **kwargs):
 382     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 383     return retval[0] if retval else None
 384
 385
 386 def get_element_html_by_attribute(attribute, value, html, **kargs):
 387     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 388     return retval[0] if retval else None
 389
 390
 391 def get_elements_by_class(class_name, html, **kargs):
 392     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 393     return get_elements_by_attribute(
 394         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 395         html, escape_value=False)
 396
 397
 398 def get_elements_html_by_class(class_name, html):
 399     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 400     return get_elements_html_by_attribute(
 401         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 402         html, escape_value=False)
 403
 404
 405 def get_elements_by_attribute(*args, **kwargs):
 406     """Return the content of the tag with the specified attribute in the passed HTML document"""
 407     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 408
 409
 410 def get_elements_html_by_attribute(*args, **kwargs):
 411     """Return the html of the tag with the specified attribute in the passed HTML document"""
 412     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 413
 414
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator: it yields one (content, whole) tuple for every opening
    tag in html that carries the attribute with the given value.

    @param attribute     Attribute name; it is regex-escaped before use
    @param value         Attribute value to look for; a falsy value yields nothing
    @param html          The HTML document to search
    @param tag           Regex that the tag name must match
    @param escape_value  If true, value is matched literally; otherwise it is
                         inserted into the pattern as a regex
    """
    if not value:
        return

    # If value starts with a character that cannot appear in an unquoted attribute
    # value, the surrounding quote is mandatory; otherwise it is optional.
    # Note that re.match only looks at the start of value
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches from "<tag" up to and including the wanted attribute, skipping over
    # any preceding attributes (quoted values may contain ">")
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Extract the complete element, starting at the matched opening tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Drop one pair of matching quotes wrapping the whole content, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 440
 441
 442 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 443     """
 444     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 445     closing tag for the first opening tag it has encountered, and can be used
 446     as a context manager
 447     """
 448
 449     class HTMLBreakOnClosingTagException(Exception):
 450         pass
 451
 452     def __init__(self):
 453         self.tagstack = collections.deque()
 454         html.parser.HTMLParser.__init__(self)
 455
 456     def __enter__(self):
 457         return self
 458
 459     def __exit__(self, *_):
 460         self.close()
 461
 462     def close(self):
 463         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 464         # so data remains buffered; we no longer have any interest in it, thus
 465         # override this method to discard it
 466         pass
 467
 468     def handle_starttag(self, tag, _):
 469         self.tagstack.append(tag)
 470
 471     def handle_endtag(self, tag):
 472         if not self.tagstack:
 473             raise compat_HTMLParseError('no tags in the stack')
 474         while self.tagstack:
 475             inner_tag = self.tagstack.pop()
 476             if inner_tag == tag:
 477                 break
 478         else:
 479             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 480         if not self.tagstack:
 481             raise self.HTMLBreakOnClosingTagException()
 482
 483
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    @returns    Tuple (content, whole): the text between the opening tag and its
                matching closing tag, and the element including both tags
    Raises compat_HTMLParseError if the opening tag, its end or a matching
    closing tag cannot be found
    """
    def find_or_raise(haystack, needle, exc):
        # Like str.index, but raises the given exception if needle is missing
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Make the offset absolute and move it past the '>' of the opening tag
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed the opening tag alone so that it becomes the bottom of the tag stack
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            # Feed the document up to and including the next closing tag candidate;
            # nested elements of the same tag are balanced by the parser's stack
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                # The parser's stack got empty: this closing tag ends the element
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
 518
 519
 520 class HTMLAttributeParser(html.parser.HTMLParser):
 521     """Trivial HTML parser to gather the attributes for a single element"""
 522
 523     def __init__(self):
 524         self.attrs = {}
 525         html.parser.HTMLParser.__init__(self)
 526
 527     def handle_starttag(self, tag, attrs):
 528         self.attrs = dict(attrs)
 529         raise compat_HTMLParseError('done')
 530
 531
 532 class HTMLListAttrsParser(html.parser.HTMLParser):
 533     """HTML parser to gather the attributes for the elements of a list"""
 534
 535     def __init__(self):
 536         html.parser.HTMLParser.__init__(self)
 537         self.items = []
 538         self._level = 0
 539
 540     def handle_starttag(self, tag, attrs):
 541         if tag == 'li' and self._level == 0:
 542             self.items.append(dict(attrs))
 543         self._level += 1
 544
 545     def handle_endtag(self, tag):
 546         self._level -= 1
 547
 548
 549 def extract_attributes(html_element):
 550     """Given a string for an HTML element such as
 551     <el
 552          a="foo" B="bar" c="&98;az" d=boz
 553          empty= noval entity="&amp;"
 554          sq='"' dq="'"
 555     >
 556     Decode and return a dictionary of attributes.
 557     {
 558         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 559         'empty': '', 'noval': None, 'entity': '&',
 560         'sq': '"', 'dq': '\''
 561     }.
 562     """
 563     parser = HTMLAttributeParser()
 564     with contextlib.suppress(compat_HTMLParseError):
 565         parser.feed(html_element)
 566         parser.close()
 567     return parser.attrs
 568
 569
 570 def parse_list(webpage):
 571     """Given a string for an series of HTML <li> elements,
 572     return a dictionary of their attributes"""
 573     parser = HTMLListAttrsParser()
 574     parser.feed(webpage)
 575     parser.close()
 576     return parser.items
 577
 578
 579 def clean_html(html):
 580     """Clean an HTML snippet into a readable string"""
 581
 582     if html is None:  # Convenience for sanitizing descriptions etc.
 583         return html
 584
 585     html = re.sub(r'\s+', ' ', html)
 586     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 587     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 588     # Strip html tags
 589     html = re.sub('<.*?>', '', html)
 590     # Replace html entities
 591     html = unescapeHTML(html)
 592     return html.strip()
 593
 594
 595 class LenientJSONDecoder(json.JSONDecoder):
 596     # TODO: Write tests
 597     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 598         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 599         self._close_attempts = 2 * close_objects
 600         super().__init__(*args, **kwargs)
 601
 602     @staticmethod
 603     def _close_object(err):
 604         doc = err.doc[:err.pos]
 605         # We need to add comma first to get the correct error message
 606         if err.msg.startswith('Expecting \',\''):
 607             return doc + ','
 608         elif not doc.endswith(','):
 609             return
 610
 611         if err.msg.startswith('Expecting property name'):
 612             return doc[:-1] + '}'
 613         elif err.msg.startswith('Expecting value'):
 614             return doc[:-1] + ']'
 615
 616     def decode(self, s):
 617         if self.transform_source:
 618             s = self.transform_source(s)
 619         for attempt in range(self._close_attempts + 1):
 620             try:
 621                 if self.ignore_extra:
 622                     return self.raw_decode(s.lstrip())[0]
 623                 return super().decode(s)
 624             except json.JSONDecodeError as e:
 625                 if e.pos is None:
 626                     raise
 627                 elif attempt < self._close_attempts:
 628                     s = self._close_object(e)
 629                     if s is not None:
 630                         continue
 631                 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
 632         assert False, 'Too many attempts to decode JSON'
 633
 634
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        # Prefer the underlying binary buffer if stdout has one
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # At most two attempts: the name as given, then its sanitize_path() version
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed; fall back to a plain, unlocked open().
                # NOTE(review): LockingUnsupportedError is presumably an OSError
                # subclass, since it is raised above to reach this fallback - confirm
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt, or right away on a permission error
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Sanitizing did not change anything, so retrying would be pointless
            if old_filename == filename:
                raise
 672
 673
 674 def timeconvert(timestr):
 675     """Convert RFC 2822 defined time string into system timestamp"""
 676     timestamp = None
 677     timetuple = email.utils.parsedate_tz(timestr)
 678     if timetuple is not None:
 679         timestamp = email.utils.mktime_tz(timetuple)
 680     return timestamp
 681
 682
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string. An empty input gives an empty
                        string; any other input gives a non-empty result
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Returns the replacement for a single character. Replacements prefixed
        # with '\0' are marked as substitute characters, so that they can be
        # collapsed and stripped further below before the marker is removed
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # '?' and ASCII control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the substitute markers; fall back to '_' if nothing is left
    result = result.replace('\0', '') or '_'

    # NO_DEFAULT is a class and therefore truthy, so this clean-up only runs
    # when is_id was passed explicitly with a falsy value
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 736
 737
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    @param s      The path to sanitize
    @param force  Sanitize on other platforms as well; without it the path is
                  returned unchanged there
    @returns      The path with every component sanitized, re-joined with os.path.join
    """
    if sys.platform == 'win32':
        # On Windows the drive/UNC prefix is split off and kept as it is;
        # force is reset so that the `elif force` branch further below is skipped
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # In every component other than '.' and '..', replace the characters
    # / < > : " | \ ? * and one trailing whitespace character or dot by '#'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep the path absolute if it started with a separator
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
 759
 760
 761 def sanitize_url(url, *, scheme='http'):
 762     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 763     # the number of unwanted failures due to missing protocol
 764     if url is None:
 765         return
 766     elif url.startswith('//'):
 767         return f'{scheme}:{url}'
 768     # Fix some common typos seen so far
 769     COMMON_TYPOS = (
 770         # https://github.com/ytdl-org/youtube-dl/issues/15649
 771         (r'^httpss://', r'https://'),
 772         # https://bx1.be/lives/direct-tv/
 773         (r'^rmtp([es]?)://', r'rtmp\1://'),
 774     )
 775     for mistake, fixup in COMMON_TYPOS:
 776         if re.match(mistake, url):
 777             return re.sub(mistake, fixup, url)
 778     return url
 779
 780
 781 def extract_basic_auth(url):
 782     parts = urllib.parse.urlsplit(url)
 783     if parts.username is None:
 784         return url, None
 785     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 786         parts.hostname if parts.port is None
 787         else '%s:%d' % (parts.hostname, parts.port))))
 788     auth_payload = base64.b64encode(
 789         ('%s:%s' % (parts.username, parts.password or '')).encode())
 790     return url, f'Basic {auth_payload.decode()}'
 791
 792
 793 def sanitized_Request(url, *args, **kwargs):
 794     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 795     if auth_header is not None:
 796         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 797         headers['Authorization'] = auth_header
 798     return urllib.request.Request(url, *args, **kwargs)
 799
 800
 801 def expand_path(s):
 802     """Expand shell variables and ~"""
 803     return os.path.expandvars(compat_expanduser(s))
 804
 805
 806 def orderedSet(iterable, *, lazy=False):
 807     """Remove all duplicates from the input iterable"""
 808     def _iter():
 809         seen = []  # Do not use set since the items can be unhashable
 810         for x in iterable:
 811             if x not in seen:
 812                 seen.append(x)
 813                 yield x
 814
 815     return _iter() if lazy else list(_iter())
 816
 817
 818 def _htmlentity_transform(entity_with_semicolon):
 819     """Transforms an HTML entity to a character."""
 820     entity = entity_with_semicolon[:-1]
 821
 822     # Known non-numeric HTML entity
 823     if entity in html.entities.name2codepoint:
 824         return chr(html.entities.name2codepoint[entity])
 825
 826     # TODO: HTML5 allows entities without a semicolon.
 827     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 828     if entity_with_semicolon in html.entities.html5:
 829         return html.entities.html5[entity_with_semicolon]
 830
 831     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 832     if mobj is not None:
 833         numstr = mobj.group(1)
 834         if numstr.startswith('x'):
 835             base = 16
 836             numstr = '0%s' % numstr
 837         else:
 838             base = 10
 839         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 840         with contextlib.suppress(ValueError):
 841             return chr(int(numstr, base))
 842
 843     # Unknown entity in name, return its literal representation
 844     return '&%s;' % entity
 845
 846
 847 def unescapeHTML(s):
 848     if s is None:
 849         return None
 850     assert isinstance(s, str)
 851
 852     return re.sub(
 853         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 854
 855
 856 def escapeHTML(text):
 857     return (
 858         text
 859         .replace('&', '&amp;')
 860         .replace('<', '&lt;')
 861         .replace('>', '&gt;')
 862         .replace('"', '&quot;')
 863         .replace("'", '&#39;')
 864     )
 865
 866
 867 def process_communicate_or_kill(p, *args, **kwargs):
 868     deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
 869                         f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
 870     return Popen.communicate_or_kill(p, *args, **kwargs)
 871
 872
 873 class Popen(subprocess.Popen):
 874     if sys.platform == 'win32':
 875         _startupinfo = subprocess.STARTUPINFO()
 876         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 877     else:
 878         _startupinfo = None
 879
 880     @staticmethod
 881     def _fix_pyinstaller_ld_path(env):
 882         """Restore LD_LIBRARY_PATH when using PyInstaller
 883             Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
 884                  https://github.com/yt-dlp/yt-dlp/issues/4573
 885         """
 886         if not hasattr(sys, '_MEIPASS'):
 887             return
 888
 889         def _fix(key):
 890             orig = env.get(f'{key}_ORIG')
 891             if orig is None:
 892                 env.pop(key, None)
 893             else:
 894                 env[key] = orig
 895
 896         _fix('LD_LIBRARY_PATH')  # Linux
 897         _fix('DYLD_LIBRARY_PATH')  # macOS
 898
 899     def __init__(self, *args, env=None, text=False, **kwargs):
 900         if env is None:
 901             env = os.environ.copy()
 902         self._fix_pyinstaller_ld_path(env)
 903
 904         self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
 905         if text is True:
 906             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 907             kwargs.setdefault('encoding', 'utf-8')
 908             kwargs.setdefault('errors', 'replace')
 909         super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
 910
 911     def communicate_or_kill(self, *args, **kwargs):
 912         try:
 913             return self.communicate(*args, **kwargs)
 914         except BaseException:  # Including KeyboardInterrupt
 915             self.kill(timeout=None)
 916             raise
 917
 918     def kill(self, *, timeout=0):
 919         super().kill()
 920         if timeout != 0:
 921             self.wait(timeout=timeout)
 922
 923     @classmethod
 924     def run(cls, *args, timeout=None, **kwargs):
 925         with cls(*args, **kwargs) as proc:
 926             default = '' if proc.__text_mode else b''
 927             stdout, stderr = proc.communicate_or_kill(timeout=timeout)
 928             return stdout or default, stderr or default, proc.returncode
 929
 930
 931 def encodeArgument(s):
 932     # Legacy code that uses byte strings
 933     # Uncomment the following line after fixing all post processors
 934     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 935     return s if isinstance(s, str) else s.decode('ascii')
 936
 937
 938 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 939
 940
 941 def timetuple_from_msec(msec):
 942     secs, msec = divmod(msec, 1000)
 943     mins, secs = divmod(secs, 60)
 944     hrs, mins = divmod(mins, 60)
 945     return _timetuple(hrs, mins, secs, msec)
 946
 947
 948 def formatSeconds(secs, delim=':', msec=False):
 949     time = timetuple_from_msec(secs * 1000)
 950     if time.hours:
 951         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 952     elif time.minutes:
 953         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 954     else:
 955         ret = '%d' % time.seconds
 956     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 957
 958
 959 def _ssl_load_windows_store_certs(ssl_context, storename):
 960     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 961     try:
 962         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 963                  if encoding == 'x509_asn' and (
 964                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 965     except PermissionError:
 966         return
 967     for cert in certs:
 968         with contextlib.suppress(ssl.SSLError):
 969             ssl_context.load_verify_locations(cadata=cert)
 970
 971
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler with an SSL context configured from params.

    Keys read from params: 'nocheckcertificate', 'legacyserverconnect',
    'compat_opts', 'client_certificate', 'client_certificate_key' and
    'client_certificate_password'. Extra keyword arguments are forwarded
    to YoutubeDLHTTPSHandler.

    Raises YoutubeDLError if the client certificate cannot be loaded.
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # NOTE: set before verify_mode below; ssl refuses CERT_NONE while check_hostname is enabled
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer the certifi bundle unless it is unavailable or disabled via the 'no-certifi' compat option
        if certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    # Optional client-side certificate; key file and password may be None
    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1033
1034
def bug_reports_message(before=';'):
    """Return the "please report this issue" notice, prefixed by before.

    Trailing whitespace of before is dropped. The notice starts with a
    capital letter when before is empty or ends a sentence ('.', '!', '?').
    """
    from ..update import REPOSITORY

    notice = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
              'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    prefix = before.rstrip()
    starts_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_sentence:
        notice = notice[0].title() + notice[1:]

    if prefix:
        return prefix + ' ' + notice
    return notice
1046
1047
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Subclasses may preset a default message here
    msg = None

    def __init__(self, msg=None):
        """Use msg if given, else the class-level msg, else the class name."""
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
1058
1059
# Exception classes treated as network failures (always exposed as a tuple)
network_exceptions = (urllib.error.URLError, http.client.HTTPException, socket.error)
if hasattr(ssl, 'CertificateError'):
    network_exceptions = (*network_exceptions, ssl.CertificateError)
1064
1065
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        cause, video_id and ie are stored on the instance and fed into the formatted message.
        """
        # Exact type membership (not issubclass): an error raised while one of the
        # listed network exception classes is being handled is always "expected"
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When raised while handling another ExtractorError, keep that error's exc_info instead
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message built from the current attributes: ie, video_id, the original
        # message, the cause and, unless the error is expected, the bug report notice
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted traceback and/or cause joined by a newline, or None if neither exists."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once a truthy msg exists, changing any other attribute rebuilds msg and args.
        # During __init__ msg is still the class default (None), so nothing is rebuilt there;
        # 'msg' and 'args' are excluded because they are assigned right below
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
1107
1108
class UnsupportedError(ExtractorError):
    """Expected extractor error reporting an unsupported URL; the URL is kept in .url."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1114
1115
class RegexNotFoundError(ExtractorError):
    """Extractor error signalling that a regular expression did not match."""
1119
1120
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    It is always marked as expected; countries is stored on the instance as given.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Any caller-supplied 'expected' is overridden
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
1132
1133
class UserNotLive(ExtractorError):
    """Expected extractor error for a channel/user that is not live."""

    def __init__(self, msg=None, **kwargs):
        message = msg or 'The channel is not currently live'
        # Any caller-supplied 'expected' is overridden
        super().__init__(message, **{**kwargs, 'expected': True})
1140
1141
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """Store the message and the optional exc_info.

        exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info()).
        """
        self.exc_info = exc_info
        super().__init__(msg)
1154
1155
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict.
    """
    # Default message used when the exception is raised without one
    msg = 'Entry not found in info'
1163
1164
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """Append ': <filename>' to the default message when filename is given."""
        message = self.msg if filename is None else f'{self.msg}: {filename}'
        super().__init__(message)
1177
1178
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by a PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
1185
1186
class DownloadCancelled(YoutubeDLError):
    """Exception raised when the download queue should be interrupted."""
    # Default message used when the exception is raised without one
    msg = 'The download was cancelled'
1190
1191
class ExistingVideoReached(DownloadCancelled):
    """Download cancellation triggered by --break-on-existing."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1195
1196
class RejectedVideoReached(DownloadCancelled):
    """Download cancellation triggered by --break-match-filter."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1200
1201
class MaxDownloadsReached(DownloadCancelled):
    """Download cancellation raised once the --max-downloads limit has been reached."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1205
1206
class ReExtractInfo(YoutubeDLError):
    """Signals that the video info needs to be re-extracted."""

    def __init__(self, msg, expected=False):
        """Store msg as the error message and keep the expected flag on the instance."""
        self.expected = expected
        super().__init__(msg)
1213
1214
class ThrottledDownload(ReExtractInfo):
    """Download speed fell below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Takes no arguments: always uses the class-level message and expected=False
        super().__init__(self.msg, expected=False)
1221
1222
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """Append ': <err>' to the default message when err is given."""
        message = self.msg if err is None else f'{self.msg}: {err}'
        super().__init__(message)
1235
1236
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """Record both sizes (in bytes) and build the message from them."""
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1250
1251
class XAttrMetadataError(YoutubeDLError):
    """Extended-attribute error classified by errno code or message text.

    reason is set to 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify by errno first, falling back to well-known message fragments
        if code in (errno.ENOSPC, errno.EDQUOT) or 'No space left' in msg or 'Disk quota exceeded' in msg:
            reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1266
1267
class XAttrUnavailableError(YoutubeDLError):
    # NOTE(review): presumably raised when extended-attribute support is not usable
    # at all — confirm against the callers; the class adds no behaviour of its own
    pass
1270
1271
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and, if configured, bind it to a source address.

    When ydl_handler._params has a 'source_address', the connection only
    tries remote addresses of the same IP family as that source address.
    is_https is accepted but not used here.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return hc

    # This is to workaround _create_connection() from socket where it will try all
    # address data from getaddrinfo() including IPv6. This filters the result from
    # getaddrinfo() based on the source_address value.
    # This is based on the cpython socket.create_connection() function.
    # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
    def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
        host, port = address
        candidates = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
        family = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
        usable = [info for info in candidates if info[0] == family]
        if candidates and not usable:
            ip_version = 'v4' if family == socket.AF_INET else 'v6'
            raise OSError(
                "No remote IP%s addresses available for connect, can't use '%s' as source address"
                % (ip_version, source_address[0]))

        last_error = None
        for af, socktype, proto, _canonname, sockaddr in usable:
            sock = None
            try:
                sock = socket.socket(af, socktype, proto)
                if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                    sock.settimeout(timeout)
                sock.bind(source_address)
                sock.connect(sockaddr)
                last_error = None  # Explicitly break reference cycle
                return sock
            except OSError as exc:
                last_error = exc
                if sock is not None:
                    sock.close()

        if last_error is None:
            raise OSError('getaddrinfo returns an empty list')
        raise last_error

    if hasattr(hc, '_create_connection'):
        hc._create_connection = _create_connection
    hc.source_address = (source_address, 0)
    return hc
1317
1318
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, going through a SOCKS proxy if 'Ytdl-socks-proxy' is set."""
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        conn_class = http.client.HTTPConnection
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # Internal marker header, must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, conn_class, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Inflate data, trying a raw deflate stream first and zlib-wrapped second."""
        if not data:
            return data
        try:
            inflated = zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            inflated = zlib.decompress(data)
        return inflated

    @staticmethod
    def brotli(data):
        """Decompress brotli data; empty input is returned as is."""
        return brotli.decompress(data) if data else data

    @staticmethod
    def gz(data):
        """Gunzip data, tolerating up to 1023 bytes of trailing junk."""
        stream = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return stream.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for trim in range(1, 1024):
                with contextlib.suppress(OSError):
                    return gzip.GzipFile(fileobj=io.BytesIO(data[:-trim]), mode='rb').read()
            # No amount of trimming helped: report the first failure
            raise original_oserror

    def http_request(self, req):
        """Escape the URL and fill in default and Accept-encoding headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        original_url = req.get_full_url()
        escaped_url = escape_url(original_url)

        # Substitute URL if any change after escaping
        if escaped_url != original_url:
            req = update_Request(req, url=escaped_url)

        default_headers = self._params.get('http_headers', std_headers)
        for name, value in default_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if name.capitalize() in req.headers:
                continue
            req.add_header(name, value)

        if 'Youtubedl-no-compression' in req.headers:  # deprecated
            del req.headers['Youtubedl-no-compression']
            req.add_header('Accept-encoding', 'identity')

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Decode the response body and percent-encode redirect locations."""
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        applied_encodings = resp.headers.get('Content-encoding', '').split(',')
        decoded_response = None
        for raw_encoding in reversed(applied_encodings):
            encoding = raw_encoding.strip()
            if encoding == 'gzip':
                decoder = self.gz
            elif encoding == 'deflate':
                decoder = self.deflate
            elif encoding == 'br' and brotli:
                decoder = self.brotli
            else:
                # Unknown or unsupported encoding: leave the data untouched
                continue
            decoded_response = decoder(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if not 300 <= resp.code < 400:
            return resp
        location = resp.headers.get('Location')
        if not location:
            return resp
        # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
        location = location.encode('iso-8859-1').decode()
        location_escaped = escape_url(location)
        if location_escaped != location:
            del resp.headers['Location']
            resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
1446
1447
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of base_class that connects through a SOCKS proxy.

    base_class must be http.client.HTTPConnection, HTTPSConnection or a
    subclass of them. socks_proxy is the proxy URL; supported schemes are
    'socks', 'socks4', 'socks4a' and 'socks5', the port defaults to 1080 and
    percent-encoded credentials in the URL are decoded.

    Raises ValueError for any other proxy scheme.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise ValueError(f'Unsupported SOCKS proxy scheme: {url_components.scheme!r}')

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS the TLS layer is added on top of the proxied socket
            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1489
1490
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler with a configurable connection class and SOCKS proxy support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._params = params
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection

    def https_open(self, req):
        """Open req, going through a SOCKS proxy if 'Ytdl-socks-proxy' is set.

        A handshake failure alert from the server is re-raised as a
        YoutubeDLError suggesting --legacy-server-connect.
        """
        # Forward whichever SSL settings the base handler stored on this instance
        open_kwargs = {}
        for attribute, keyword in (('_context', 'context'), ('_check_hostname', 'check_hostname')):
            if hasattr(self, attribute):
                open_kwargs[keyword] = getattr(self, attribute)

        conn_class = self._https_conn_class
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # Internal marker header, must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, conn_class, True)
        try:
            return self.do_open(connection_factory, req, **open_kwargs)
        except urllib.error.URLError as e:
            handshake_failed = (
                isinstance(e.reason, ssl.SSLError)
                and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE')
            if handshake_failed:
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
1519
1520
def is_path_like(f):
    """Return True if f is a str, bytes or os.PathLike object."""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1523
1524
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """HTTPCookieProcessor whose request/response hooks also serve https."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    # The same cookie handling applies to https requests and responses
    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1534
1535
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7231
     and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the request that follows a redirect of req to newurl.

        The method and payload are kept, except that a 303 turns anything but
        HEAD into GET, and a 301/302 turns POST into GET. Whenever the method
        changes, the payload and its Content-Length/Content-Type headers are dropped.

        Raises urllib.error.HTTPError for codes other than 301, 302, 303, 307 and 308.
        """
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_method = req.get_method()
        new_data = req.data
        # Lowercase names of the headers that must not be carried over
        remove_headers = set()
        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and req.get_method() != 'HEAD':
            new_method = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        elif code in (301, 302) and req.get_method() == 'POST':
            new_method = 'GET'

        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.update(('content-length', 'content-type'))

        # Header names are compared case-insensitively (urllib stores them as e.g. 'Content-type')
        new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
1579
1580
def extract_timezone(date_str):
    """Split a trailing timezone off date_str.

    Returns (timezone, date_str): the UTC offset as a datetime.timedelta and
    the date string without the timezone part. A 'Z' or a numeric +hh[:]mm /
    -hh[:]mm offset is tried first, then a timezone name listed in
    TIMEZONE_NAMES after a time. Without a recognised timezone the offset is
    zero and date_str is returned unchanged.
    """
    offset_match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)

    if offset_match:
        remainder = date_str[:-len(offset_match.group('tz'))]
        sign_char = offset_match.group('sign')
        if not sign_char:
            # Plain 'Z': UTC
            return datetime.timedelta(), remainder
        sign = 1 if sign_char == '+' else -1
        offset = datetime.timedelta(
            hours=sign * int(offset_match.group('hours')),
            minutes=sign * int(offset_match.group('minutes')))
        return offset, remainder

    # No numeric offset: look for a timezone abbreviation right after a time
    name_match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    named_offset = TIMEZONE_NAMES.get(name_match and name_match.group('tz').strip())
    if named_offset is not None:
        date_str = date_str[:-len(name_match.group('tz'))]
    return datetime.timedelta(hours=named_offset or 0), date_str
1609
1610
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """
    Convert an ISO 8601-like date string into a UNIX timestamp.

    @param delimiter    Separator between the date and the time part
    @param timezone     datetime.timedelta UTC offset; when None it is
                        extracted from date_str itself
    @returns            Integer timestamp, or None if date_str is None or
                        does not match the expected layout
    """
    if date_str is None:
        return None

    # The strptime layout below has no fractional part, so drop any ".digits"
    stripped = re.sub(r'\.[0-9]+', '', date_str)
    if timezone is None:
        timezone, stripped = extract_timezone(stripped)

    try:
        parsed = datetime.datetime.strptime(stripped, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((parsed - timezone).timetuple())
    except ValueError:
        return None
1626
1627
def date_formats(day_first=True):
    """Pick the strptime format collection matching the requested day/month order"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1630
1631
def unified_strdate(date_str, day_first=True):
    """
    Normalize a free-form date string to 'YYYYMMDD'.

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Commas become spaces, then the AM/PM marker (and a timezone word after it) is dropped
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    _, cleaned = extract_timezone(cleaned)

    result = None
    # Every format is tried; when several parse, the last successful one wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to the e-mail (RFC 2822 style) date parser
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if result is None else str(result)
1654
1655
def unified_timestamp(date_str, day_first=True):
    """
    Convert a free-form date/time string into a UNIX timestamp.

    @param day_first    Prefer day-before-month formats when parsing
    @returns            The timestamp, or None if date_str is None or no
                        known format matches
    """
    if date_str is None:
        return None

    # Drop commas, pipes and weekday names (Sunday included), then collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str))

    # strptime is fed the string without the AM/PM marker, so PM is re-added as 12 hours
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # The first format that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to the e-mail (RFC 2822 style) date parser
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1687
1688
def determine_ext(url, default_ext='unknown_video'):
    """
    Guess the file extension from a URL.

    Returns default_ext when url is None, contains no dot, or the text after
    the last dot does not look like an extension.
    """
    if url is None or '.' not in url:
        return default_ext

    without_query, _, _ = url.partition('?')
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1700
1701
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build the subtitle file name by swapping the extension for '<sub_lang>.<sub_format>'"""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1704
1705
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Build a datetime object out of a (possibly relative) date expression.
    Accepted syntax:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format used to parse DATE
    @param precision    Rounding applied to the result: auto|microsecond|second|minute|hour|day
                        auto: round to the unit used in date_str (if any).
    """
    round_to_unit = precision == 'auto'
    if round_to_unit:
        precision = 'microsecond'

    now = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str == 'yesterday':
        return now - datetime.timedelta(days=1)
    if date_str in ('now', 'today'):
        return now

    mobj = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if mobj is None:
        # Plain absolute date
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the offset may itself be a relative expression
    base = datetime_from_str(mobj.group('start'), precision, format)
    amount = int(mobj.group('time'))
    if mobj.group('sign') == '-':
        amount = -amount

    unit = mobj.group('unit')
    if unit in ('month', 'year'):
        # Months/years vary in length, so they are added on the calendar
        shifted = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(shifted, unit) if round_to_unit else shifted
1746
1747
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Build a date object out of a date expression via datetime_from_str

    @param strict  Only accept "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1758
1759
def datetime_add_months(dt, months):
    """Shift a datetime object forwards/backwards by a number of months.

    The day is clamped to the last day of the target month.
    """
    # Work with a zero-based month index so that divmod yields the year carry
    year_carry, month_index = divmod(dt.month + months - 1, 12)
    year, month = dt.year + year_carry, month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1767
1768
def datetime_round(dt, precision='day'):
    """
    Round the time of a datetime object to the given precision

    'microsecond' returns dt untouched; other supported values are
    'day', 'hour', 'minute' and 'second'.
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    epoch = calendar.timegm(dt.timetuple())
    step = seconds_per_unit[precision]
    # Adding half a step before flooring rounds to the nearest multiple (ties go up)
    rounded = ((epoch + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1785
1786
def hyphenate_date(date_str):
    """
    Turn a 'YYYYMMDD' date into 'YYYY-MM-DD'; any other string is returned unchanged"""
    mobj = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if mobj is None:
        return date_str
    year, month, day = mobj.groups()
    return f'{year}-{month}-{day}'
1795
1796
class DateRange:
    """Represents a time interval between two dates (both ends included)"""

    def __init__(self, start=None, end=None):
        """
        start and end must be strings in the format accepted by date_from_str
        in strict mode; None leaves that side of the range open.

        Raises ValueError if start lies after end.
        """
        if start is not None:
            self.start = date_from_str(start, strict=True)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end, strict=True)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a date, a datetime or a date string) is in the range"""
        if isinstance(date, datetime.datetime):
            # datetime is a subclass of date, but the two cannot be ordered
            # against each other, so compare on the calendar day only
            date = date.date()
        elif not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1830
1831
@functools.cache
def system_identifier():
    """Describe the running Python, platform, OpenSSL and libc in a single line"""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_parts = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_parts = []
    libc_suffix = format_field(join_nonempty(*libc_parts, delim=' '), None, ', %s')

    details = (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        libc_suffix,
    )
    return 'Python %s (%s %s %s) - %s (%s%s)' % details
1850
1851
@functools.cache
def get_windows_version():
    ''' Get Windows version as a version tuple; returns () when not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1859
1860
def write_string(s, out=None, encoding=None):
    """
    Write the string s to out (sys.stderr by default) and flush it.

    Text is encoded (dropping unencodable characters) when the stream is
    binary or exposes an underlying binary buffer.
    """
    assert isinstance(s, str)
    stream = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not stream:
        return

    text = s
    if compat_os_name == 'nt' and supports_terminal_sequences(stream):
        text = re.sub(r'([\r\n]+)', r' \1', text)

    if 'b' in getattr(stream, 'mode', ''):
        # Stream opened in binary mode: encode and write to it directly
        target, enc = stream, encoding or preferredencoding()
    elif hasattr(stream, 'buffer'):
        # Text wrapper: bypass it and write bytes to the underlying buffer
        target = stream.buffer
        enc = encoding or getattr(stream, 'encoding', None) or preferredencoding()
    else:
        target, enc = stream, None

    payload = text.encode(enc, 'ignore') if enc else text
    target.write(payload)
    stream.flush()
1880
1881
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """
    Report a deprecation.

    Outside the CLI a DeprecationWarning is issued through the warnings
    module. In the CLI each distinct message is reported once, through
    printer if given or else via write_string.
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    seen = deprecation_warning._cache
    if msg in seen:
        return
    seen.add(msg)

    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Messages that have already been reported in CLI mode
deprecation_warning._cache = set()
1897
1898
def bytes_to_intlist(bs):
    """Convert a byte string (or a str of characters) into a list of integers"""
    if not bs:
        return []
    # Indexing bytes yields ints already; characters need ord()
    return list(bs) if isinstance(bs[0], int) else list(map(ord, bs))
1906
1907
def intlist_to_bytes(xs):
    """Pack a sequence of integers (0-255) into a bytes object"""
    if xs:
        return struct.pack(f'{len(xs)}B', *xs)
    return b''
1912
1913
class LockingUnsupportedError(OSError):
    """Raised when no file locking mechanism is available"""

    # Fixed message passed to OSError
    msg = 'File locking is not supported'

    def __init__(self):
        message = self.msg
        super().__init__(message)
1919
1920
# Cross-platform file locking
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) for the current platform
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure expected by LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte count (low/high DWORD) passed to lock/unlock the file from offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the open file f; raises BlockingIOError if the lock cannot be taken"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK, 0x1 == LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError(f'Unlocking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock the open file f with flock(), falling back to lockf()"""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # The lock is held by someone else; do not retry with lockf()
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock on f, trying each unlocking method in turn"""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            # Last resort; errors from this attempt propagate to the caller
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
2007
2008
class locked_file:
    """
    File wrapper that holds a lock on the file while used as a context manager.

    Read modes take a shared lock, all other modes an exclusive one. Unknown
    attributes are forwarded to the underlying file object.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode = mode
        self.block = block

        can_write = not set('wax+').isdisjoint(mode)
        can_read = not set('r+').isdisjoint(mode)
        if not can_write:
            flags = os.O_RDONLY
        elif can_read:
            flags = os.O_RDWR
        else:
            flags = os.O_WRONLY

        # O_CLOEXEC is UNIX only; O_BINARY and O_NOINHERIT are Windows only
        for optional_flag in ('O_CLOEXEC', 'O_BINARY', 'O_NOINHERIT'):
            flags |= getattr(os, optional_flag, 0)
        if can_write:
            flags |= os.O_CREAT  # O_TRUNC only after locking
        if 'a' in mode:
            flags |= os.O_APPEND
        if 'x' in mode:
            flags |= os.O_EXCL

        fd = os.open(filename, flags, 0o666)
        self.f = os.fdopen(fd, mode, encoding=encoding)

    def __enter__(self):
        want_exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, want_exclusive, self.block)
            self.locked = True
        except OSError:
            # Could not lock: do not leave the file open behind us
            self.f.close()
            raise

        if 'w' in self.mode:
            # Truncation is deferred until the lock is held
            try:
                self.f.truncate()
            except OSError as err:
                tolerated = (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                )
                if err.errno not in tolerated:
                    raise
        return self

    def unlock(self):
        """Release the lock if it is currently held"""
        if self.locked:
            try:
                _unlock_file(self.f)
            finally:
                self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Anything not defined here is looked up on the wrapped file
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2072
2073
@functools.cache
def get_filesystem_encoding():
    """Return the filesystem encoding reported by Python, or 'utf-8' if it reports None"""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2078
2079
def shell_quote(args):
    """Quote each argument for the shell and join them with spaces"""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
2089
2090
def smuggle_url(url, data):
    """ Attach extra data to a URL (in its fragment) for internal use.

    Data already smuggled into url is merged into data, which is updated in place.
    """
    plain_url, previous = unsmuggle_url(url, {})
    data.update(previous)
    payload = json.dumps(data)
    return plain_url + '#' + urllib.parse.urlencode({'__youtubedl_smuggle': payload})
2099
2100
def unsmuggle_url(smug_url, default=None):
    """
    Split a smuggled URL into (url, data).

    When nothing was smuggled, smug_url and default are returned as they are.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default

    url, _, fragment = smug_url.rpartition('#')
    params = urllib.parse.parse_qs(fragment)
    return url, json.loads(params['__youtubedl_smuggle'][0])
2108
2109
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    @param fmt      %-format string receiving (scaled number, suffix)
    @param factor   Ratio between consecutive suffixes; 1024 selects the
                    binary suffixes (Ki, Mi, ...)
    @returns        The formatted string, or None if num is not a number or is negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp on both sides: values below 1/factor have a negative logarithm,
        # which would otherwise index the suffix list from its end
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2122
2123
def format_bytes(bytes):
    """Format a byte count with binary suffixes (e.g. KiB); 'N/A' if it cannot be formatted"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2126
2127
def lookup_unit_table(unit_table, s, strict=False):
    """
    Parse "<number><unit>" from s and scale the number by the unit's multiplier.

    @param unit_table   Mapping of unit name to multiplier
    @param strict       Require the whole string to match and only accept
                        '.' as decimal separator (otherwise ',' works too)
    @returns            The rounded product, or None if s does not match
    """
    if strict:
        matcher, num_re = re.fullmatch, NUMBER_RE
    else:
        matcher, num_re = re.match, NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(map(re.escape, unit_table))

    mobj = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not mobj:
        return None

    value = float(mobj.group('num').replace(',', '.'))
    return round(value * unit_table[mobj.group('unit')])
2139
2140
def parse_bytes(s):
    """Parse a string indicating a byte quantity (e.g. "10K") into an integer"""
    units = ['', *'KMGTPEZY']
    # Each step up the list is another factor of 1024
    table = {unit: 1024 ** power for power, unit in enumerate(units)}
    return lookup_unit_table(table, s.upper(), strict=True)
2146
2147
def parse_filesize(s):
    """Parse a human readable file size such as "5 MiB" into a number of bytes

    Returns None if s is None or is not recognized.
    """
    if s is None:
        return None

    # (prefix letter, decimal unit name, binary unit name), smallest first
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        # The lower-case forms are of course incorrect and unofficial,
        # but we support those too
        unit_table.update({
            f'{letter}iB': binary,
            f'{letter}B': decimal,
            f'{lower}B': binary,
            f'{letter}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
2217
2218
def parse_count(s):
    """Parse a count such as "1,234" or "1.5M views" into an integer

    Returns None if s is None or no number can be found.
    """
    if s is None:
        return None

    # Drop a leading non-numeric label that is followed by whitespace
    text = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, text)
    if scaled is not None:
        return scaled

    # Last resort: a plain number followed by unrelated text
    leading = re.match(r'([\d,.]+)(?:$|\s)', text)
    return str_to_int(leading.group(1)) if leading else None
2246
2247
def parse_resolution(s, *, lenient=False):
    """
    Extract video dimensions from a string.

    Understands "<width>x<height>", "<height>p"/"<height>i" and "4k"/"8k".
    Returns a dict with 'width' and/or 'height', or {} if nothing is found.

    @param lenient  Also accept "<width>x<height>" glued to other alphanumerics
    """
    if s is None:
        return {}

    dimensions = (
        re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s) if lenient
        else re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s))
    if dimensions:
        width, height = dimensions.group('w', 'h')
        return {'width': int(width), 'height': int(height)}

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(kilo.group(1)) * 540}

    return {}
2271
2272
def parse_bitrate(s):
    """Extract the number in front of "kbps" from s; None if s is not a str or has no such number"""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
2279
2280
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of the month with the given full name

    Names are looked up for lang, falling back to English for unknown
    languages. Returns None if the name is not found.
    """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name in names:
        return names.index(name) + 1
    return None
2290
2291
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of the month with the given three-letter
        English abbreviation, or None if there is no such month """
    abbreviations = [full_name[:3] for full_name in ENGLISH_MONTH_NAMES]
    if abbrev not in abbreviations:
        return None
    return abbreviations.index(abbrev) + 1
2300
2301
def fix_xml_ampersands(xml_str):
    """Escape every '&' in XML that does not already start an entity as '&amp;'"""
    # The lookahead skips the predefined entities and numeric character references
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
2308
2309
def setproctitle(title):
    """
    Set the process name through libc's prctl(), where that is available.

    Silently does nothing if ctypes, libc.so.6 or prctl cannot be used.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except (OSError, TypeError):
        return
    # Initialising the buffer from the bytes themselves makes it one byte
    # larger than the title, so the name passed to prctl is NUL-terminated
    buf = ctypes.create_string_buffer(title.encode())
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
2335
2336
def remove_start(s, start):
    """Strip the prefix start from s if present; None and non-matching strings pass through"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2339
2340
def remove_end(s, end):
    """Strip the suffix end from s if present; None and non-matching strings pass through"""
    if s is None or not s.endswith(end):
        return s
    # An empty suffix always "matches", but s[:-0] would wipe the whole string
    return s[:-len(end)] if end else s
2343
2344
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s, if any"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2352
2353
def get_domain(url):
    """
    Return the network location of url without a leading "www.", or None if empty.

    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
2360
2361
def url_basename(url):
    """Return the last component of the path of url ('' if the path is empty)"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
2365
2366
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any query or fragment"""
    mobj = re.match(r'https?://[^?#]+/', url)
    return mobj.group()
2369
2370
def urljoin(base, path):
    """
    Join path onto base, tolerating bytes and invalid input.

    A path that is already absolute or protocol-relative is returned as is.
    Returns None if path is empty or not a string, or if base is not an
    http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not (isinstance(path, str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode()
    base_is_usable = isinstance(base, str) and re.match(
        r'^(?:https?:)?//', base)
    if not base_is_usable:
        return None
    return urllib.parse.urljoin(base, path)
2384
2385
class HEADRequest(urllib.request.Request):
    """urllib request whose HTTP method is always HEAD"""

    def get_method(self):
        """Report 'HEAD' as the method of this request"""
        return 'HEAD'
2389
2390
class PUTRequest(urllib.request.Request):
    """urllib request whose HTTP method is always PUT"""

    def get_method(self):
        """Report 'PUT' as the method of this request"""
        return 'PUT'
2394
2395
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """
    Convert v to int, returning default if that is not possible.

    @param get_attr     If set, the attribute of v with this name is converted instead
    @param invscale     The value is multiplied by this ...
    @param scale        ... and then floor-divided by this
    """
    value = v
    if get_attr and value is not None:
        value = getattr(value, get_attr, None)
    try:
        result = int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        result = default
    return result
2403
2404
def str_or_none(v, default=None):
    """Convert v to str; None becomes default"""
    if v is None:
        return default
    return str(v)
2407
2408
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Integers pass through, strings are parsed after dropping ',', '.' and
    '+' characters, anything else yields None.
    """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
2416
2417
def float_or_none(v, scale=1, invscale=1, default=None):
    """
    Convert v to float, returning default if that is not possible.

    @param invscale     The value is multiplied by this ...
    @param scale        ... and then divided by this
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError, OverflowError):
        # OverflowError: e.g. an integer too large to be represented as a float
        return default
2425
2426
def bool_or_none(v, default=None):
    """Return v if it is a real bool, otherwise default"""
    if isinstance(v, bool):
        return v
    return default
2429
2430
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace if it is a str, otherwise default"""
    if not isinstance(v, str):
        return default
    return v.strip()
2433
2434
def url_or_none(url):
    """
    Return url stripped of surrounding whitespace if it looks like a URL.

    Accepted are protocol-relative URLs and the http(s), rtmp*/rtsp*, mms
    and ftp(s) schemes; anything else yields None.
    """
    if not url or not isinstance(url, str):
        return None
    stripped = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', stripped):
        return stripped
    return None
2440
2441
def request_to_url(req):
    """Return the full URL of a urllib Request; any other value is returned unchanged"""
    if not isinstance(req, urllib.request.Request):
        return req
    return req.get_full_url()
2447
2448
def strftime_or_none(timestamp, date_format, default=None):
    """
    Format a timestamp with strftime, returning default on any failure.

    @param timestamp    UNIX timestamp (int/float) or a 'YYYYMMDD' string
    @param date_format  strftime format; %s (seconds since the epoch) is
                        supported on all platforms
    @param default      Returned if timestamp is of another type, cannot be
                        parsed, or is outside the range datetime can represent
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                               + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # For any other type datetime_object stays None and the AttributeError
        # raised below selects the default
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError):
        # OverflowError: the timestamp lies beyond what timedelta/datetime can hold
        return default
2466
2467
def parse_duration(s):
    """
    Parse a duration string into a number of seconds.

    Three notations are tried in order: colon separated
    ("[[dd:]hh:]mm:ss[.ms]"), unit suffixed ("1h 2m 3s", "PT1H2M3S"), and a
    single decimal amount of hours or minutes ("1.5 hours", "2.5 min").

    Returns the duration as a float, or None if s is not a string, is empty
    after stripping, or matches none of the notations.
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # Colon separated notation; once a part precedes the seconds,
    # the seconds are limited to two digits
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # Unit suffixed notation; the year, month and week parts are matched
        # but not captured, so they do not contribute to the result
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            # The pattern's capturing groups are, in order: days, hours, mins, secs, ms
            days, hours, mins, secs, ms = m.groups()
        else:
            # A single, possibly fractional, amount of hours or minutes
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        # ms still carries its separator; turn ":123" into ".123" so float() reads a fraction
        ms = ms.replace(':', '.')
    # Missing parts count as zero
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2522
2523
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the existing extension of `filename`

    If `expected_real_ext` is given and the actual extension differs from it,
    `ext` is appended after the whole filename instead.
    """
    stem, suffix = os.path.splitext(filename)
    if expected_real_ext and suffix[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{stem}.{ext}{suffix}'
2530
2531
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of `filename` with `ext`

    If `expected_real_ext` is given and the actual extension differs from it,
    nothing is removed and `ext` is appended to the whole filename.
    """
    stem, suffix = os.path.splitext(filename)
    if expected_real_ext and suffix[1:] != expected_real_ext:
        base = filename
    else:
        base = stem
    return '{}.{}'.format(base, ext)
2537
2538
def check_executable(exe, args=[]):
    """ Checks if the given binary can be launched, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if launching it raises OSError """
    command = [exe] + args
    captured = {'stdout': subprocess.PIPE, 'stderr': subprocess.PIPE}
    try:
        Popen.run(command, **captured)
    except OSError:
        return False
    else:
        return exe
2547
2548
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout/stderr text

    @returns  the output, None if the exit status is non-zero,
              or False if running the executable raises OSError
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        output, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else output
2561
2562
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable

    @param output        text to search (must be a str)
    @param version_re    regex whose first group is the version;
                         defaults to matching 'version <something>'
    @param unrecognized  returned when the regex does not match
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2572
2573
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    unrecognized may hold one or two values: the first is returned when the
    output contains no recognizable version, the last when no output could
    be obtained (_get_exe_version_output returned None) """
    fallbacks = variadic(unrecognized)
    assert len(fallbacks) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return fallbacks[-1]
    if not output:
        # False (could not be run) or empty output is passed through as-is
        return output
    return detect_exe_version(output, version_re, fallbacks[0])
2584
2585
def frange(start=0, stop=None, step=1):
    """Float range

    Like range(), but the bounds and the step may be floats (stop may be
    infinite). With a single argument it counts from 0 up to that value.
    A zero step yields nothing.
    """
    if stop is None:
        stop, start = start, 0

    if not step:
        direction = 0
    elif step > 0:
        direction = 1
    else:
        direction = -1

    current = start
    # Multiplying both sides by the direction turns ">" into "<" for negative steps
    while direction * current < direction * stop:
        yield current
        current += step
2594
2595
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        # Items already pulled from the iterable, always in forward order
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if not self._reversed:
            # Replay what is already cached, then keep pulling (and caching) new items
            yield from self._cache
            for element in self._iterable:
                self._cache.append(element)
                yield element
        else:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()

    def _exhaust(self):
        """Pull every remaining item into the cache and return the cache"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        everything = self._exhaust()
        stride = -1 if self._reversed else 1
        return everything[::stride]

    @staticmethod
    def _reverse_index(x):
        """Map an index counted from one end to the same item counted from the other"""
        if x is None:
            return None
        return ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        counts_from_end = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if counts_from_end:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            # Only pull as many items as are needed to reach the furthest index
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))

        try:
            return self._cache[idx]
        except IndexError as err:
            raise self.IndexError(err) from err

    def __bool__(self):
        # Probing the first item (in iteration order) avoids exhausting the iterable
        probe = -1 if self._reversed else 0
        try:
            self[probe]
        except self.IndexError:
            return False
        else:
            return True

    def __len__(self):
        return len(self._exhaust())

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2683
2684
class PagedList:
    """Base class for lists whose items are fetched page by page

    Subclasses implement _getslice(start, end); pages are obtained through
    getpage(), which calls pagefunc(pagenum) and optionally caches the result.
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        # Highest page number worth requesting; unknown until narrowed down
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the items of page `pagenum` as a list"""
        results = self._cache.get(pagenum)
        if results is None:
            if pagenum > self._pagecount:
                # Pages beyond the known page count are empty; do not call pagefunc
                results = []
            else:
                results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        """Return the items with indices in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not (isinstance(idx, int) and idx >= 0):
            raise TypeError('indices must be non-negative integers')
        found = self.getslice(idx, idx + 1)
        if found:
            return found[0]
        raise self.IndexError()
2723
2724
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            # Item indices covered by this page: [page_first, page_limit)
            page_first = pagenum * self._pagesize
            page_limit = page_first + self._pagesize
            if start >= page_limit:
                continue

            # Offset of the first wanted item within this page
            offset = start % self._pagesize if page_first <= start < page_limit else 0
            # Position within this page at which to stop, if the slice ends here
            if end is not None and page_first <= end <= page_limit:
                cutoff = ((end - 1) % self._pagesize) + 1
            else:
                cutoff = None

            try:
                items = self.getpage(pagenum)
            except Exception:
                # Remember that no page from this one onwards can be fetched
                self._pagecount = pagenum - 1
                raise
            if offset != 0 or cutoff is not None:
                items = items[offset:cutoff]
            yield from items

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            # Likewise, if we got the whole page but the next page is not
            # interesting, break out early as well
            if len(items) + offset < self._pagesize or end == page_limit:
                break
2764
2765
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, use_cache=True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        # Page holding the first wanted item, and how many items to skip within it
        first_page, skip = divmod(start, self._pagesize)
        if end is None:
            last_page, remaining = self._pagecount, None
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start

        for pagenum in range(first_page, last_page):
            page = self.getpage(pagenum)
            if skip:
                page = page[skip:]
                skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    yield from page[:remaining]
                    break
                remaining -= len(page)
            yield from page
2790
2791
class PlaylistEntries:
    """Indexable view over the 'entries' of a playlist info dict

    Indexing uses 1-based playlist positions and produces a generator of
    (position, entry) pairs; see __getitem__.
    """

    # Sentinel stored in the slots of `_entries` for which no entry was supplied
    MissingEntry = object()
    # Becomes True when the entries are a plain list, or once indexing runs past the end
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        object providing `params`, `report_warning` and `_match_entry`
        @param info_dict  dict with 'entries' and, optionally, 'requested_entries'
        @raises EntryNotInPlaylist  if info_dict has no 'entries'
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # 'requested_entries' holds the 1-based position of each given entry;
            # every other position up to the largest one is marked as missing
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            # Any other iterable is wrapped so that it can be indexed lazily
            self._entries = LazyList(entries)

    # One comma-separated segment of a playlist item specification:
    # an optional start, optionally followed by ':' or '-', an optional end
    # (a number, 'inf' or 'infinite') and an optional ':step'
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Parse a comma-separated item specification

        Yields an int for each single index and a slice(start, end, step)
        for each range.

        @raises ValueError  on an empty segment, a segment that does not
                            match PLAYLIST_ITEMS_RE, or a step of zero
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # NOTE(review): `end` goes through float_or_none, presumably so that 'inf' is accepted - confirm
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (position, entry) for the items selected by the ydl params

        The selection comes from 'playlist_items' when given, otherwise from
        'playliststart' and 'playlistend'. Iteration stops early when
        ydl._match_entry raises ExistingVideoReached or RejectedVideoReached.
        """
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries if it is known, else None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one item per page, the page count equals the item count
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function fetching the entry at a 0-based index

        The returned function raises self.IndexError when the index is past
        the end of the entries.
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # The lookup is wrapped in ydl's _handle_extraction_exceptions
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Generate (position, entry) pairs for an int or a slice

        Positions are 1-based and both slice bounds are inclusive. Negative
        bounds are resolved against len(self).
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # frange() excludes its stop value, so move it one past the last wanted index
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forwards, nothing further can exist; going backwards, keep looking
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts the entries by iterating over all of them
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2926
2927
def uppercase_escape(s):
    """Decode every literal \\UXXXXXXXX escape sequence found in `s`"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (decoded_text, consumed_length)
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2934
2935
def lowercase_escape(s):
    """Decode every literal \\uXXXX escape sequence found in `s`"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (decoded_text, consumed_length)
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2942
2943
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that are passed through unquoted (note that this includes '%',
    # so existing percent-escapes are left untouched)
    unescaped = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe=unescaped)
2947
2948
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)
    # The host is IDNA-encoded; the remaining components are percent-escaped
    ascii_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = {
        component: escape_rfc3986(getattr(parts, component))
        for component in ('path', 'params', 'query', 'fragment')
    }
    return parts._replace(netloc=ascii_netloc, **escaped).geturl()
2959
2960
def parse_qs(url, **kwargs):
    """Parse the query string of `url` into a dict of lists

    Extra keyword arguments are forwarded to urllib.parse.parse_qs.
    """
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2963
2964
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file

    Each line holds one URL. Byte lines are decoded as UTF-8, a leading BOM
    and surrounding whitespace are stripped, and empty lines as well as
    lines starting with '#', ';' or ']' are skipped. A trailing comment
    introduced by whitespace followed by '#' is removed.

    @param batch_fd  iterable of str or bytes lines; it is closed on return
    @returns         list of the URLs found
    """
    BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit must be passed by keyword; passing it positionally
        # is deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2982
2983
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data (as urllib.parse.urlencode does) and return it as ASCII bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2986
2987
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  update query
       @returns             str
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            # Nothing to change; hand back the string untouched
            return url
        url = urllib.parse.urlparse(url)

    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        # Values from query_update take precedence over the existing ones
        merged_query = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged_query, True)

    return urllib.parse.urlunparse(url._replace(**kwargs))
3006
3007
def update_url_query(url, query):
    """Return `url` with its query string updated from `query`

    Shorthand for update_url(url, query_update=query).
    """
    return update_url(url, query_update=query)
3010
3011
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Build a new request based on `req` with some parts replaced

    @param url      replacement URL (defaults to the URL of `req`)
    @param data     replacement body (the body of `req` is kept if this is falsy)
    @param headers  headers added to, or overriding, those of `req`
    @param query    query parameters merged into the URL
    @returns        a new request using the same HTTP method as `req`
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    # HEAD and PUT need their dedicated request classes to keep the method
    method = req.get_method()
    request_cls = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(method, urllib.request.Request)

    clone = request_cls(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
3030
3031
def _multipart_encode_impl(data, boundary):
    """Encode `data` as multipart/form-data using the given boundary

    @param data      dict whose keys and values are str or bytes-like
    @param boundary  str used as the part delimiter
    @returns         (encoded body as bytes, Content-Type header value)
    @raises ValueError  if the boundary occurs inside one of the parts
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes

    chunks = []
    for k, v in data.items():
        if isinstance(k, str):
            k = k.encode()
        if isinstance(v, str):
            v = v.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks.append(delimiter + b'\r\n')
        chunks.append(content)

    # Closing delimiter
    chunks.append(delimiter + b'--\r\n')

    return b''.join(chunks), content_type
3052
3053
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple of the encoded body and the Content-Type header value.
    A ValueError is raised if a specified boundary occurs within the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-chosen boundary is never replaced; a clash propagates as ValueError
        out, content_type = _multipart_encode_impl(data, boundary)
        return out, content_type

    # Keep drawing random boundaries until one does not clash with the data
    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            out, content_type = _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
        return out, content_type
3082
3083
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether `x` is an instance of `allowed_types` but not of `blocked_types`

    By default str, bytes and mappings are blocked, i.e. not treated as iterables.
    """
    if blocked_types is NO_DEFAULT:
        blocked_types = str, bytes, collections.abc.Mapping
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
3088
3089
def variadic(x, allowed_types=NO_DEFAULT):
    """Return `x` if it is iterable-like, otherwise wrap it in a 1-tuple

    `allowed_types` lists iterable types that should nevertheless be treated
    as single values; it is passed to is_iterable_like as `blocked_types`.
    """
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
3095
3096
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Return the result of the first function that succeeds

    Each function is called with `args` and `kwargs`. A call counts as failed
    if it raises one of the lookup/conversion errors listed below, or if
    `expected_type` is given and the result is not an instance of it.

    @returns  the first acceptable result, or None if there is none
    """
    suppressed = (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError)
    for candidate in funcs:
        try:
            result = candidate(*args, **kwargs)
        except suppressed:
            continue
        if expected_type is not None and not isinstance(result, expected_type):
            continue
        return result
    return None
3106
3107
def try_get(src, getter, expected_type=None):
    """Apply one or more getter functions to `src` via try_call

    `getter` may be a single callable or an iterable of callables; each is
    called with `src` as its only argument.
    """
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
3110
3111
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with only the items for which cndn(key, value) is true

    By default, items whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
3114
3115
def merge_dicts(*dicts):
    """Merge dicts, giving precedence to the earlier ones

    A value of None never takes part in the merge. The first value seen for
    a key wins, except that an empty string is replaced by a later string.
    """
    merged = {}
    for source in dicts:
        for key, value in source.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
3124
3125
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as a str, decoding it with `encoding` if it is not one already"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
3128
3129
# US rating labels mapped to the age limit that parse_age_limit() returns for them.
# Keys are upper-case since parse_age_limit() upper-cases its input before the lookup
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
3137
3138
# TV rating labels mapped to the age limit that parse_age_limit() returns for them.
# parse_age_limit() builds its regex from the part of each key after the 'TV-' prefix
# and also accepts 'TV_' or no separator at all
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3147
3148
def parse_age_limit(s):
    """Convert an age rating into a numeric age limit

    Accepts an int between 0 and 21, a string of one or two digits with an
    optional trailing '+', a key of US_RATINGS, or a TV rating such as
    'TV-14' (case-insensitive for the last two).

    @returns  the age limit as an int, or None if `s` is not understood
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    # Match against the part of each TV rating that follows its 'TV-' prefix
    tv_suffixes = '|'.join(name[3:] for name in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
3165
3166
def strip_jsonp(code):
    """Unwrap a JSONP response, leaving only the argument passed to the callback

    Handles an optional 'window.' prefix, the 'cb && cb(...)' guard form, an
    optional trailing ';' and trailing '//' comments. Text that does not look
    like a callback invocation is returned unchanged.
    """
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
3175
3176
def js_to_json(code, vars={}, *, strict=False):
    """Convert JavaScript object/array literal code into JSON text

    The code is rewritten token by token with regular expressions:
    strings are re-quoted with double quotes, comments and trailing commas
    are dropped, hex and octal integers become decimal, `undefined` and
    `void 0` become `null`, and names found in `vars` are substituted.

    @param code    the JavaScript source text
    @param vars    dict of var, val pairs to substitute
    @param strict  if true, skip the lenient rewrites of `new ...`,
                   `parseInt(...)` and function-call wrappers, and raise
                   ValueError for unknown names instead of quoting them
    @returns       the converted text
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # One alternative per quote character: a quoted string with backslash escapes
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Optional whitespace with at most one comment in between
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (regex, base) pairs for integer literals that JSON cannot express directly;
    # the optional trailing ':' is present when the integer is used as an object key
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Called for every bare double quote (group 1) and every
        # backslash-escaped character (group 2) within a string body
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Escapes JSON understands are kept (and bare '"' gains a backslash),
        # '\x' becomes the prefix '\u00', an escaped newline is removed,
        # and any other escaped character loses its backslash
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Evaluate the expression inside `${...}` of a template string
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            # The result is a JSON string; embed its content without the quotes
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Replacement callback for each token matched by the final regex below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, runs of '!' and trailing commas are removed
            return ''

        if v[0] in STRING_QUOTES:
            # Template strings (backticks) get their ${...} parts substituted first
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Object keys must be strings in JSON
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                # In non-strict mode, a value that is not valid JSON is serialized;
                # otherwise (and always in strict mode) it is inserted verbatim
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            # Unknown names (e.g. unquoted object keys) are turned into strings
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # `new Map([[k, v], ...])` becomes a JSON object built from its pairs
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # `new Date("...")` is replaced by its quoted argument
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        # Any other `new X(...)` expression is kept as a JSON string of its source text
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        # `parseInt(...)` is replaced by the digits found in its argument
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        # An immediately-invoked function called with a quoted argument is replaced by that argument
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    # Tokens, in order: strings; comments and trailing commas; `void 0` and
    # identifiers; hex/octal integers (optionally used as keys); decimal
    # integers used as keys; runs of '!'
    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
3255
3256
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its position in quality_ids,
    or to -1 if the id is not listed """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            return -1
        return position
    return q
3265
3266
# Names of the recognized processing stages.
# NOTE(review): presumably the points at which a postprocessor can be run, in order - confirm against the users of this tuple
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3268
3269
# Default output templates, written with %(field)s style placeholders
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# The known output template types, each mapped to a string or None.
# NOTE(review): the strings look like the filename suffix used for that type - confirm against the callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
3287
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# This is a template, not a finished regex: {0} stands for the pattern of the
# mapping key and {1} for the pattern of the conversion type.
# NOTE(review): presumably filled in with str.format() before compiling - confirm.
# The "prefix" group captures any escaped percent signs ('%%') that directly
# precede the format specifier
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''
3303
3304
# Conversion type characters of printf-style ("%") string formatting.
# NOTE(review): presumably used to build the conversion type part ({1}) of STR_FORMAT_RE_TMPL - confirm
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3306
3307
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Returns `s` unchanged if it is None or at most `length` characters long.
    Otherwise `s` is cut so that, together with the trailing '...', the result
    is exactly `length` characters long. If `length` is too small to hold the
    whole ellipsis, a correspondingly shortened ellipsis is returned so that
    the result never exceeds `length` characters.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # s[:length - len(ELLIPSES)] would use a negative end index here and
        # produce a result longer than `length` (possibly longer than `s`)
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
3316
3317
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints

    Raises ValueError if any of the parts is not an integer.
    """
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
3320
3321
def is_outdated_version(version, limit, assume_new=True):
    """Whether `version` is older than `limit`

    If `version` is empty or either version cannot be parsed, the answer is
    decided by `assume_new`: the version is then reported as outdated only
    if `assume_new` is false.
    """
    undecided = not assume_new
    if not version:
        return undecided
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return undecided
3329
3330
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """
    # Imported here rather than at module level
    from ..update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
3337
3338
def args_to_str(args):
    """ Get a short string representation for a subprocess command:
    every argument is shell-quoted and the results are joined with spaces """
    quoted_args = (compat_shlex_quote(arg) for arg in args)
    return ' '.join(quoted_args)
3342
3343
def error_to_str(err):
    """ Format an exception as "<name of its class>: <its message>" """
    err_name = type(err).__name__
    return f'{err_name}: {err}'
3346
3347
def mimetype2ext(mt, default=NO_DEFAULT):
    """ Guess a file extension from a MIME type.

    @param mt       MIME type, optionally followed by ";parameters"
    @param default  Value returned when the type is unknown or mt is not a string
    @returns        The mapped extension. Without an explicit default, an unknown type
                    yields its subtype with "+" replaced by "." and a non-string mt yields None
    """
    if not isinstance(mt, str):
        return None if default is NO_DEFAULT else default

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop any ";parameter" part and normalize before looking anything up
    mimetype = mt.partition(';')[0].strip().lower()
    subtype = mimetype.rpartition('/')[2]
    suffix = subtype.rsplit('+')[-1]

    # Candidates from most to least specific: full type, subtype, structured-syntax suffix
    ext = traversal.traverse_obj(MAP, mimetype, subtype, suffix)
    if ext:
        return ext
    if default is NO_DEFAULT:
        return subtype.replace('+', '.')
    return default
3436
3437
def ext2mimetype(ext_or_url):
    """ Guess the MIME type for a bare extension or for a filename/URL.
    Returns None for empty input or when the type cannot be guessed """
    if not ext_or_url:
        return None
    # Anything without a dot is treated as a bare extension and gets a dummy filename
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mimetype, _ = mimetypes.guess_type(filename)
    return mimetype
3444
3445
def parse_codecs(codecs_str):
    """ Split a comma-separated codecs string into video, audio and subtitle codecs.

    @param codecs_str  Codecs string as described by RFC 6381
    @returns           {} for empty input. Otherwise a dict with "vcodec", "acodec"
                       ("none" when absent), "dynamic_range" and, only when a subtitle
                       codec was found, "scodec". If nothing was recognized but exactly
                       two codecs were given, they are taken as video and audio in order
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = {
        'avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'}
    AUDIO_CODECS = {
        'flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
        'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'}
    SUBTITLE_CODECS = {'stpp', 'wvtt'}

    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]
    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros directly in front of a digit are dropped so that
        # e.g. "av01" and "vp09" are recognized as "av1" and "vp9"
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                continue  # Only the first video codec is used
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if not (vcodec or acodec or scodec):
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        return {}

    result = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if scodec is not None:
        result['scodec'] = scodec
    return result
3486
3487
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """ Choose a container extension for merging the given video and audio streams.

    @param vcodecs, vexts  Codecs and extensions of the video streams (equal length)
    @param acodecs, aexts  Codecs and extensions of the audio streams (equal length)
    @param preferences     Optional sequence of acceptable extensions, in order of preference
    @returns               The first extension found compatible by codec, then by extension;
                           otherwise "mkv" if it is allowed, else the last preference
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codecs):
        # First codec of the list, reduced to the part before the first "." with all zeros removed
        return try_get(codecs, getter=lambda x: x[0].split('.')[0].replace('0', ''))

    vcodec = sanitize_codec(vcodecs)
    acodec = sanitize_codec(acodecs)

    # First pass: decide by codec
    for ext in preferences or COMPATIBLE_CODECS:
        supported_codecs = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv':
            return ext
        if supported_codecs.issuperset((vcodec, acodec)):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    # Second pass: decide by the extensions of the input files
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext}:
            return ext
        if any(ext_family.issuperset(current_exts) for ext_family in COMPATIBLE_EXTS):
            return ext

    if allow_mkv:
        return 'mkv'
    return preferences[-1]
3526
3527
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """ Detect the file extension from the headers of a response.
    Tried in order: the filename in Content-Disposition, the x-amz-meta-name
    header and finally the Content-Type (using default if it is unknown) """
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    amz_meta_name = headers.get('x-amz-meta-name')
    if amz_meta_name:
        # Everything after the last "." (the whole value if there is no ".")
        ext = amz_meta_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'), default=default)
3546
3547
def encode_data_uri(data, mime_type):
    """ Build a base64 "data:" URI from the bytes in data and the given MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3550
3551
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no limit is set or the content is available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3560
3561
# List of known byte-order-marks (BOM) with the encoding each one announces.
# The UTF-32 LE mark must stay ahead of the UTF-16 LE mark, which is a prefix of it
BOMS = [
    (codecs.BOM_UTF8, 'utf-8'),
    (codecs.BOM_UTF32_BE, 'utf-32-be'),
    (codecs.BOM_UTF32_LE, 'utf-32-le'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
]


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.
    Returns a match object if the text starts with "<" after optional whitespace, else None """

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip every leading occurrence of this mark and remember what it announced
        while first_bytes.startswith(bom):
            first_bytes = first_bytes[len(bom):]
            encoding = bom_encoding

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3581
3582
def determine_protocol(info_dict):
    """ Return the "protocol" of info_dict if it is set; otherwise derive one
    from the scheme and the extension of its "url" """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        if info_dict.get('is_live'):
            return 'm3u8'
        return 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3603
3604
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a header and a list of rows (each a list of values) as aligned text.

    A tab character inside a cell is expanded so that the text after it is right aligned.
    @param delim       If set, a line made of this string is inserted below the header
    @param extra_gap   Additional spaces between columns
    @param hide_empty  Drop the columns whose cells are all empty in data
    """
    def visible_len(text):
        return len(remove_terminal_sequences(text).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_len(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, keep):
        return [cell for wanted, cell in itertools.zip_longest(keep, row, fillvalue=True) if wanted]

    # A column width of 0 in data marks the column as empty
    keep = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, keep)
    data = [keep_columns(row, keep) for row in data]

    widths = column_widths([header_row, *data])
    gap = extra_gap + 1

    rows = [header_row]
    if delim:
        separator = [delim * (col_width + gap) for col_width in widths]
        separator[-1] = separator[-1][:-gap * len(delim)]  # Remove the gap from end of delimiter
        rows.append(separator)
    rows.extend(data)

    lines = []
    for row in rows:
        cells = []
        for pos, text in enumerate(map(str, row)):
            padding = widths[pos] - visible_len(text)
            if '\t' in text:
                cells.append(text.replace('\t', ' ' * padding) + ' ' * gap)
            else:
                cells.append(text + ' ' * (padding + gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3635
3636
3637 def _match_one(filter_part, dct, incomplete):
3638     # TODO: Generalize code with YoutubeDL._build_format_filter
3639     STRING_OPERATORS = {
3640         '*=': operator.contains,
3641         '^=': lambda attr, value: attr.startswith(value),
3642         '$=': lambda attr, value: attr.endswith(value),
3643         '~=': lambda attr, value: re.search(value, attr),
3644     }
3645     COMPARISON_OPERATORS = {
3646         **STRING_OPERATORS,
3647         '<=': operator.le,  # "<=" must be defined above "<"
3648         '<': operator.lt,
3649         '>=': operator.ge,
3650         '>': operator.gt,
3651         '=': operator.eq,
3652     }
3653
3654     if isinstance(incomplete, bool):
3655         is_incomplete = lambda _: incomplete
3656     else:
3657         is_incomplete = lambda k: k in incomplete
3658
3659     operator_rex = re.compile(r'''(?x)
3660         (?P<key>[a-z_]+)
3661         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3662         (?:
3663             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3664             (?P<strval>.+?)
3665         )
3666         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3667     m = operator_rex.fullmatch(filter_part.strip())
3668     if m:
3669         m = m.groupdict()
3670         unnegated_op = COMPARISON_OPERATORS[m['op']]
3671         if m['negation']:
3672             op = lambda attr, value: not unnegated_op(attr, value)
3673         else:
3674             op = unnegated_op
3675         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3676         if m['quote']:
3677             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3678         actual_value = dct.get(m['key'])
3679         numeric_comparison = None
3680         if isinstance(actual_value, (int, float)):
3681             # If the original field is a string and matching comparisonvalue is
3682             # a number we should respect the origin of the original field
3683             # and process comparison value as a string (see
3684             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3685             try:
3686                 numeric_comparison = int(comparison_value)
3687             except ValueError:
3688                 numeric_comparison = parse_filesize(comparison_value)
3689                 if numeric_comparison is None:
3690                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3691                 if numeric_comparison is None:
3692                     numeric_comparison = parse_duration(comparison_value)
3693         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3694             raise ValueError('Operator %s only supports string values!' % m['op'])
3695         if actual_value is None:
3696             return is_incomplete(m['key']) or m['none_inclusive']
3697         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3698
3699     UNARY_OPERATORS = {
3700         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3701         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3702     }
3703     operator_rex = re.compile(r'''(?x)
3704         (?P<op>%s)\s*(?P<key>[a-z_]+)
3705         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3706     m = operator_rex.fullmatch(filter_part.strip())
3707     if m:
3708         op = UNARY_OPERATORS[m.group('op')]
3709         actual_value = dct.get(m.group('key'))
3710         if is_incomplete(m.group('key')) and actual_value is None:
3711             return True
3712         return op(actual_value)
3713
3714     raise ValueError('Invalid filter part %r' % filter_part)
3715
3716
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.

    The filter is split into conditions at every "&" that is not escaped as "\\&";
    all of them have to pass.
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    for raw_part in re.split(r'(?<!\\)&', filter_str):
        condition = raw_part.replace(r'\&', '&')
        if not _match_one(condition, dct, incomplete):
            return False
    return True
3727
3728
def match_filter_func(filters, breaking_filters=None):
    """ Build a match-filter callable from filter strings.

    @param filters           Filter string(s); an entry passes if any of them matches.
                             The special filter "-" makes passing entries return NO_DEFAULT
                             (unless the info is incomplete)
    @param breaking_filters  Filter string(s); an entry rejected by these raises RejectedVideoReached
    @returns                 None if no filters were given, otherwise a function
                             (info_dict, incomplete=False) returning None/NO_DEFAULT when the
                             entry passes or a message explaining why it is skipped
    """
    if not (filters or breaking_filters):
        return None
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        rejection = breaking_filters(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if filters and not any(match_str(f, info_dict, incomplete) for f in filters):
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'

        if interactive and not incomplete:
            return NO_DEFAULT
        return None
    return _match_func
3751
3752
class download_range_func:
    """ Callable that yields the sections to download: the chapters whose title
    matches one of the given regexes, followed by the given (start, end) ranges """

    def __init__(self, chapters, ranges):
        self.chapters = chapters
        self.ranges = ranges

    def __call__(self, info_dict, ydl):
        # Without any restriction, a single empty section is produced
        if not (self.ranges or self.chapters):
            yield {}

        if info_dict.get('chapters'):
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'

        for regex in self.chapters or []:
            for idx, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(regex, chapter['title']):
                    continue
                warning = None
                yield {**chapter, 'index': idx}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        cls_name = type(self).__name__
        return f'{__name__}.{cls_name}({self.chapters}, {self.ranges})'
3779
3780
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP/TTML time expression to seconds.
    Accepts a plain number with optional "s" suffix, or a clock value "H:MM:SS"
    with an optional fraction after "." or ":". Returns None if nothing matches """
    if not time_expr:
        return

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        # The fraction may be separated by ":" instead of "."
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3792
3793
def srt_subtitles_timecode(seconds):
    """ Format a number of seconds as a timecode of the form HH:MM:SS,mmm """
    timestamp = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timestamp
3796
3797
def ass_subtitles_timecode(seconds):
    """ Format a number of seconds as a timecode of the form H:MM:SS.cc """
    timestamp = timetuple_from_msec(seconds * 1000)
    leading_fields = timestamp[:-1]
    centiseconds = timestamp.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*leading_fields, centiseconds)
3801
3802
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT.

    Supported styling is rendered with <b>, <i>, <u> and <font> tags.
    Paragraphs without a begin time, or without both an end time and a duration, are skipped.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document does not contain any paragraph
    '''
    # Namespaces of older drafts, mapped to the namespace that is queried below
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> resolved properties (including inherited ones)
    default_style = {}  # properties applied to every element

    class TTMLPElementParser:
        _out = ''
        # NOTE(review): these two lists are class attributes and are therefore shared by all
        # parser instances created during one dfxp2srt call (one per paragraph), so state may
        # carry over between paragraphs. Left unchanged since it affects the generated markup
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence: default style < referenced style < inline attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties that are already in effect
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close the tags opened for this element, innermost first
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize the node and feed it through the styling-aware parser above
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style may reference a parent that is declared later,
    # so the declarations are processed repeatedly until every reference is resolved
    style_nodes = dfxp.findall(_x('.//ttml:style'))
    while True:
        unresolved = False
        resolved_before = len(styles)
        for style in style_nodes:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    unresolved = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # A parent becomes resolvable only once its id is added to `styles`. A pass that
        # adds nothing can therefore never be followed by one that does: stop instead of
        # looping forever on missing, self-referential or cyclic style references.
        # Styles that are still unresolved at this point are left undefined
        if not unresolved or len(styles) == resolved_before:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # The index counts every paragraph, including the ones that are skipped
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3969
3970
def cli_option(params, command_option, param, separator=None):
    """ Build the command line arguments for an option that takes a value.

    @returns  [] if params[param] is missing or None; [option, value] without a separator;
              otherwise a single argument joining option and value with the separator
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3976
3977
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Build the command line arguments for a boolean option:
    params[param] is translated to true_value/false_value and passed on to cli_option.
    Nothing is emitted when the value is missing or None """
    flag = params.get(param)
    assert flag in (True, False, None)
    value_by_flag = {True: true_value, False: false_value}
    return cli_option(value_by_flag, command_option, flag, separator)
3982
3983
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] if params[param] equals expected_value, otherwise [] """
    if params.get(param) == expected_value:
        return [command_option]
    return []
3986
3987
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """ Look up extra command line arguments.

    @param argdict     Dict mapping lowercase keys to lists of arguments. A list/tuple is
                       returned unchanged if use_compat is set and ignored otherwise
    @param keys        List/tuple of keys to try in order; an entry may itself hold several
                       keys, whose argument lists are concatenated
    @param default     Returned as is (not copied) when nothing is found
    @returns           The arguments of the first entry of keys having any key present in argdict
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = (argdict.get(key.lower()) for key in variadic(key_list))
        arg_lists = [args for args in candidates if args is not None]
        if arg_lists:
            return [arg for args in arg_lists for arg in args]
    return default
4006
4007
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """ Build the list of lookup keys for main_key and exe (both lowercased) and
    fetch the matching arguments through cli_configuration_args.

    The root key is exe itself if it equals main_key, else "main_key+exe"; each of keys
    is appended to it as a suffix. Only when the bare root key is among the results are
    the fallbacks (main_key and exe combined, then "default") added; otherwise the
    backward-compatibility handling of list arguments is disabled
    """
    main_key, exe = main_key.lower(), exe.lower()
    same_name = main_key == exe
    root_key = exe if same_name else f'{main_key}+{exe}'
    keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in keys:
        use_compat = False
    else:
        if not same_name:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
4019
4020
class ISO639Utils:
    """ Conversion between two-letter (ISO 639-1) and three-letter (ISO 639-2/T) language codes """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up; None for unknown codes
        short_code = code[:2]
        return cls._lang_map.get(short_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # The first matching entry of the table wins; None for unknown codes
        return next(
            (short_name for short_name, long_name in cls._lang_map.items() if long_name == code),
            None)
4225
4226
class ISO3166Utils:
    """Lookup of country names by their two-letter country code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full name for a two-letter (ISO 3166-1 alpha-2) country code

        The lookup is case-insensitive; None is returned for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4488
4489
4490 class GeoUtils:
4491     # Major IPv4 address blocks per country
4492     _country_ip_map = {
4493         'AD': '46.172.224.0/19',
4494         'AE': '94.200.0.0/13',
4495         'AF': '149.54.0.0/17',
4496         'AG': '209.59.64.0/18',
4497         'AI': '204.14.248.0/21',
4498         'AL': '46.99.0.0/16',
4499         'AM': '46.70.0.0/15',
4500         'AO': '105.168.0.0/13',
4501         'AP': '182.50.184.0/21',
4502         'AQ': '23.154.160.0/24',
4503         'AR': '181.0.0.0/12',
4504         'AS': '202.70.112.0/20',
4505         'AT': '77.116.0.0/14',
4506         'AU': '1.128.0.0/11',
4507         'AW': '181.41.0.0/18',
4508         'AX': '185.217.4.0/22',
4509         'AZ': '5.197.0.0/16',
4510         'BA': '31.176.128.0/17',
4511         'BB': '65.48.128.0/17',
4512         'BD': '114.130.0.0/16',
4513         'BE': '57.0.0.0/8',
4514         'BF': '102.178.0.0/15',
4515         'BG': '95.42.0.0/15',
4516         'BH': '37.131.0.0/17',
4517         'BI': '154.117.192.0/18',
4518         'BJ': '137.255.0.0/16',
4519         'BL': '185.212.72.0/23',
4520         'BM': '196.12.64.0/18',
4521         'BN': '156.31.0.0/16',
4522         'BO': '161.56.0.0/16',
4523         'BQ': '161.0.80.0/20',
4524         'BR': '191.128.0.0/12',
4525         'BS': '24.51.64.0/18',
4526         'BT': '119.2.96.0/19',
4527         'BW': '168.167.0.0/16',
4528         'BY': '178.120.0.0/13',
4529         'BZ': '179.42.192.0/18',
4530         'CA': '99.224.0.0/11',
4531         'CD': '41.243.0.0/16',
4532         'CF': '197.242.176.0/21',
4533         'CG': '160.113.0.0/16',
4534         'CH': '85.0.0.0/13',
4535         'CI': '102.136.0.0/14',
4536         'CK': '202.65.32.0/19',
4537         'CL': '152.172.0.0/14',
4538         'CM': '102.244.0.0/14',
4539         'CN': '36.128.0.0/10',
4540         'CO': '181.240.0.0/12',
4541         'CR': '201.192.0.0/12',
4542         'CU': '152.206.0.0/15',
4543         'CV': '165.90.96.0/19',
4544         'CW': '190.88.128.0/17',
4545         'CY': '31.153.0.0/16',
4546         'CZ': '88.100.0.0/14',
4547         'DE': '53.0.0.0/8',
4548         'DJ': '197.241.0.0/17',
4549         'DK': '87.48.0.0/12',
4550         'DM': '192.243.48.0/20',
4551         'DO': '152.166.0.0/15',
4552         'DZ': '41.96.0.0/12',
4553         'EC': '186.68.0.0/15',
4554         'EE': '90.190.0.0/15',
4555         'EG': '156.160.0.0/11',
4556         'ER': '196.200.96.0/20',
4557         'ES': '88.0.0.0/11',
4558         'ET': '196.188.0.0/14',
4559         'EU': '2.16.0.0/13',
4560         'FI': '91.152.0.0/13',
4561         'FJ': '144.120.0.0/16',
4562         'FK': '80.73.208.0/21',
4563         'FM': '119.252.112.0/20',
4564         'FO': '88.85.32.0/19',
4565         'FR': '90.0.0.0/9',
4566         'GA': '41.158.0.0/15',
4567         'GB': '25.0.0.0/8',
4568         'GD': '74.122.88.0/21',
4569         'GE': '31.146.0.0/16',
4570         'GF': '161.22.64.0/18',
4571         'GG': '62.68.160.0/19',
4572         'GH': '154.160.0.0/12',
4573         'GI': '95.164.0.0/16',
4574         'GL': '88.83.0.0/19',
4575         'GM': '160.182.0.0/15',
4576         'GN': '197.149.192.0/18',
4577         'GP': '104.250.0.0/19',
4578         'GQ': '105.235.224.0/20',
4579         'GR': '94.64.0.0/13',
4580         'GT': '168.234.0.0/16',
4581         'GU': '168.123.0.0/16',
4582         'GW': '197.214.80.0/20',
4583         'GY': '181.41.64.0/18',
4584         'HK': '113.252.0.0/14',
4585         'HN': '181.210.0.0/16',
4586         'HR': '93.136.0.0/13',
4587         'HT': '148.102.128.0/17',
4588         'HU': '84.0.0.0/14',
4589         'ID': '39.192.0.0/10',
4590         'IE': '87.32.0.0/12',
4591         'IL': '79.176.0.0/13',
4592         'IM': '5.62.80.0/20',
4593         'IN': '117.192.0.0/10',
4594         'IO': '203.83.48.0/21',
4595         'IQ': '37.236.0.0/14',
4596         'IR': '2.176.0.0/12',
4597         'IS': '82.221.0.0/16',
4598         'IT': '79.0.0.0/10',
4599         'JE': '87.244.64.0/18',
4600         'JM': '72.27.0.0/17',
4601         'JO': '176.29.0.0/16',
4602         'JP': '133.0.0.0/8',
4603         'KE': '105.48.0.0/12',
4604         'KG': '158.181.128.0/17',
4605         'KH': '36.37.128.0/17',
4606         'KI': '103.25.140.0/22',
4607         'KM': '197.255.224.0/20',
4608         'KN': '198.167.192.0/19',
4609         'KP': '175.45.176.0/22',
4610         'KR': '175.192.0.0/10',
4611         'KW': '37.36.0.0/14',
4612         'KY': '64.96.0.0/15',
4613         'KZ': '2.72.0.0/13',
4614         'LA': '115.84.64.0/18',
4615         'LB': '178.135.0.0/16',
4616         'LC': '24.92.144.0/20',
4617         'LI': '82.117.0.0/19',
4618         'LK': '112.134.0.0/15',
4619         'LR': '102.183.0.0/16',
4620         'LS': '129.232.0.0/17',
4621         'LT': '78.56.0.0/13',
4622         'LU': '188.42.0.0/16',
4623         'LV': '46.109.0.0/16',
4624         'LY': '41.252.0.0/14',
4625         'MA': '105.128.0.0/11',
4626         'MC': '88.209.64.0/18',
4627         'MD': '37.246.0.0/16',
4628         'ME': '178.175.0.0/17',
4629         'MF': '74.112.232.0/21',
4630         'MG': '154.126.0.0/17',
4631         'MH': '117.103.88.0/21',
4632         'MK': '77.28.0.0/15',
4633         'ML': '154.118.128.0/18',
4634         'MM': '37.111.0.0/17',
4635         'MN': '49.0.128.0/17',
4636         'MO': '60.246.0.0/16',
4637         'MP': '202.88.64.0/20',
4638         'MQ': '109.203.224.0/19',
4639         'MR': '41.188.64.0/18',
4640         'MS': '208.90.112.0/22',
4641         'MT': '46.11.0.0/16',
4642         'MU': '105.16.0.0/12',
4643         'MV': '27.114.128.0/18',
4644         'MW': '102.70.0.0/15',
4645         'MX': '187.192.0.0/11',
4646         'MY': '175.136.0.0/13',
4647         'MZ': '197.218.0.0/15',
4648         'NA': '41.182.0.0/16',
4649         'NC': '101.101.0.0/18',
4650         'NE': '197.214.0.0/18',
4651         'NF': '203.17.240.0/22',
4652         'NG': '105.112.0.0/12',
4653         'NI': '186.76.0.0/15',
4654         'NL': '145.96.0.0/11',
4655         'NO': '84.208.0.0/13',
4656         'NP': '36.252.0.0/15',
4657         'NR': '203.98.224.0/19',
4658         'NU': '49.156.48.0/22',
4659         'NZ': '49.224.0.0/14',
4660         'OM': '5.36.0.0/15',
4661         'PA': '186.72.0.0/15',
4662         'PE': '186.160.0.0/14',
4663         'PF': '123.50.64.0/18',
4664         'PG': '124.240.192.0/19',
4665         'PH': '49.144.0.0/13',
4666         'PK': '39.32.0.0/11',
4667         'PL': '83.0.0.0/11',
4668         'PM': '70.36.0.0/20',
4669         'PR': '66.50.0.0/16',
4670         'PS': '188.161.0.0/16',
4671         'PT': '85.240.0.0/13',
4672         'PW': '202.124.224.0/20',
4673         'PY': '181.120.0.0/14',
4674         'QA': '37.210.0.0/15',
4675         'RE': '102.35.0.0/16',
4676         'RO': '79.112.0.0/13',
4677         'RS': '93.86.0.0/15',
4678         'RU': '5.136.0.0/13',
4679         'RW': '41.186.0.0/16',
4680         'SA': '188.48.0.0/13',
4681         'SB': '202.1.160.0/19',
4682         'SC': '154.192.0.0/11',
4683         'SD': '102.120.0.0/13',
4684         'SE': '78.64.0.0/12',
4685         'SG': '8.128.0.0/10',
4686         'SI': '188.196.0.0/14',
4687         'SK': '78.98.0.0/15',
4688         'SL': '102.143.0.0/17',
4689         'SM': '89.186.32.0/19',
4690         'SN': '41.82.0.0/15',
4691         'SO': '154.115.192.0/18',
4692         'SR': '186.179.128.0/17',
4693         'SS': '105.235.208.0/21',
4694         'ST': '197.159.160.0/19',
4695         'SV': '168.243.0.0/16',
4696         'SX': '190.102.0.0/20',
4697         'SY': '5.0.0.0/16',
4698         'SZ': '41.84.224.0/19',
4699         'TC': '65.255.48.0/20',
4700         'TD': '154.68.128.0/19',
4701         'TG': '196.168.0.0/14',
4702         'TH': '171.96.0.0/13',
4703         'TJ': '85.9.128.0/18',
4704         'TK': '27.96.24.0/21',
4705         'TL': '180.189.160.0/20',
4706         'TM': '95.85.96.0/19',
4707         'TN': '197.0.0.0/11',
4708         'TO': '175.176.144.0/21',
4709         'TR': '78.160.0.0/11',
4710         'TT': '186.44.0.0/15',
4711         'TV': '202.2.96.0/19',
4712         'TW': '120.96.0.0/11',
4713         'TZ': '156.156.0.0/14',
4714         'UA': '37.52.0.0/14',
4715         'UG': '102.80.0.0/13',
4716         'US': '6.0.0.0/8',
4717         'UY': '167.56.0.0/13',
4718         'UZ': '84.54.64.0/18',
4719         'VA': '212.77.0.0/19',
4720         'VC': '207.191.240.0/21',
4721         'VE': '186.88.0.0/13',
4722         'VG': '66.81.192.0/20',
4723         'VI': '146.226.0.0/16',
4724         'VN': '14.160.0.0/11',
4725         'VU': '202.80.32.0/20',
4726         'WF': '117.20.32.0/21',
4727         'WS': '202.4.32.0/19',
4728         'YE': '134.35.0.0/16',
4729         'YT': '41.242.116.0/22',
4730         'ZA': '41.0.0.0/11',
4731         'ZM': '102.144.0.0/13',
4732         'ZW': '102.177.192.0/18',
4733     }
4734
4735     @classmethod
4736     def random_ipv4(cls, code_or_block):
4737         if len(code_or_block) == 2:
4738             block = cls._country_ip_map.get(code_or_block.upper())
4739             if not block:
4740                 return None
4741         else:
4742             block = code_or_block
4743         addr, preflen = block.split('/')
4744         addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4745         addr_max = addr_min | (0xffffffff >> int(preflen))
4746         return str(socket.inet_ntoa(
4747             struct.pack('!L', random.randint(addr_min, addr_max))))
4748
4749
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler that lets an individual request select its own proxy

    A request may carry a `Ytdl-request-proxy` header whose value replaces the
    handler-wide proxy for that request only. The header is removed before
    the request is processed further.
    """

    def __init__(self, proxies=None):
        # Set default handlers so that http(s) requests always pass through
        # proxy_open(), even when no proxy is configured for the scheme
        for scheme in ('http', 'https'):
            setattr(
                self, f'{scheme}_open',
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open: meth(r, proxy, type))
        super().__init__(proxies)

    def proxy_open(self, req, proxy, type):
        """Apply `proxy` (or the per-request override) to `req`"""
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = urllib.parse.urlparse(proxy).scheme.lower()
        if proxy_scheme not in ('socks', 'socks4', 'socks4a', 'socks5'):
            return super().proxy_open(req, proxy, type)

        # yt-dlp's http/https handlers do wrapping the socket with socks
        req.add_header('Ytdl-socks-proxy', proxy)
        return None
4773
4774
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into the public domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4778
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    The result carries no leading zero bytes, except that zero and negative
    values yield a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # Minimal number of bytes needed to hold n
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'

    remainder = len(s) % blocksize if blocksize > 0 else 0
    if remainder:
        s = b'\000' * (blocksize - remainder) + s
    return s
4807
4808
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    An empty byte string converts to 0.
    """
    # Left-pad with zeros to a whole number of 32-bit words
    padding = -len(s) % 4
    if padding:
        s = b'\000' * padding + s

    value = 0
    for offset in range(0, len(s), 4):
        (word,) = struct.unpack('>I', s[offset:offset + 4])
        value = (value << 32) | word
    return value
4824
4825
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    The input bytes are reversed before being interpreted as a hexadecimal
    number, i.e. `data` is read as a little-endian integer.

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return f'{ciphertext:x}'
4841
4842
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout `00 02 <padding> 00 <data>`, where the padding
    consists of at least 8 random *nonzero* bytes. A zero byte inside the
    padding would be mistaken for the separator when the block is unpadded,
    so the random bytes are drawn from 1..255.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data does not leave room for 11 bytes of overhead
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + list(data)
4856
4857
def _base_n_table(n, table):
    """Return the digit alphabet for base `n`

    `table` defaults to the digits, lowercase and uppercase ASCII letters
    (62 symbols) and is truncated to its first `n` symbols when `n` is given.

    @raises ValueError  if neither argument is given, or if the table
                        holds fewer than `n` symbols
    """
    if not (table or n):
        raise ValueError('Either table or n must be specified')
    if not table:
        table = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    table = table[:n]

    if n and len(table) != n:
        raise ValueError(f'base {n} exceeds table length {len(table)}')
    return table
4866
4867
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string

    @param num    non-negative integer to convert
    @param n      base; may be omitted when `table` is given
    @param table  digit alphabet; defaults to 0-9a-zA-Z
    @raises ValueError  for a negative `num`, or for a nonzero `num` with a
                        base below 2 (neither has a finite representation;
                        the digit loop would never terminate)
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    if num < 0:
        raise ValueError('Cannot encode a negative number')
    if base < 2:
        raise ValueError(f'Cannot encode a nonzero number in base {base}')

    # Collect the digits least-significant first and reverse once at the end
    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4879
4880
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int

    Each character of `string` must be a symbol of the digit alphabet
    (see `n` and `table`), otherwise a KeyError is raised.
    """
    digit_values = {}
    for position, symbol in enumerate(_base_n_table(n, table)):
        digit_values[symbol] = position

    radix = len(digit_values)
    total = 0
    for symbol in string:
        total = total * radix + digit_values[symbol]
    return total
4888
4889
def decode_packed_codes(code):
    """Expand code matched by PACKED_CODES_RE back into readable form

    The match supplies the obfuscated code, a base, a symbol count and a
    `|`-separated symbol list. Every word of the obfuscated code is replaced
    by the symbol whose index it spells in that base; an empty symbol means
    the word stands for itself.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, radix, remaining, words = mobj.groups()
    radix, remaining = int(radix), int(remaining)
    words = words.split('|')

    symbol_table = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        symbol_table[token] = words[remaining] or token

    def _lookup(match):
        return symbol_table[match.group(0)]

    return re.sub(r'\b(\w+)\b', _lookup, obfuscated_code)
4906
4907
def caesar(s, alphabet, shift):
    """Shift every character of `s` by `shift` positions within `alphabet`

    The shift wraps around the end of the alphabet; characters that are not
    part of `alphabet` are passed through unchanged.
    """
    if shift == 0:
        return s

    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4915
4916
def rot47(s):
    """Apply a Caesar shift of 47 over the 94 printable ASCII characters `!` to `~`"""
    printable = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable, 47)
4919
4920
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated `KEY=value` attribute list into a dict

    Values may be quoted (quotes are stripped, commas inside are kept) or
    unquoted. If a key occurs more than once, the last value wins.
    """
    pattern = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in re.findall(pattern, attrib)
    }
4928
4929
def urshift(val, n):
    """Unsigned right shift: negative `val` is first offset by 2**32"""
    if val < 0:
        val += 0x100000000
    return val >> n
4932
4933
def write_xattr(path, key, value):
    """
    Store `value` as the extended attribute `key` of the file at `path`

    The first applicable method is used:
    an NTFS Alternate Data Stream when compat_os_name is 'nt', otherwise the
    imported `xattr` module, otherwise the `setfattr` or `xattr` executable.

    @param path   file to attach the attribute to
    @param key    attribute name; must not contain ':' on Windows
    @param value  attribute value as bytes (it is written to a binary stream,
                  or decoded to str before being passed to an executable)
    @raises XAttrMetadataError     if writing the attribute fails
    @raises XAttrUnavailableError  if neither a usable module nor executable is found
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        # ':' separates the file name from the stream name in `path:key`
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        # With an older pyxattr, setxattr stays None and the executables are tried
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    # setfattr is preferred; xattr is only probed when setfattr is missing
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The value is passed as a command-line argument, so it must be text
    value = value.decode()
    try:
        # The two tools take the key/value/path arguments in different forms
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        # A nonzero exit status is reported together with the tool's stderr output
        raise XAttrMetadataError(returncode, stderr)
4983
4984
def random_birthday(year_field, month_field, day_field):
    """Generate a random date of birth between 1950-01-01 and 1995-12-31 (inclusive)

    @returns  dict mapping the three given field names to the year, month
              and day of the date, each as a decimal string
    """
    earliest = datetime.date(1950, 1, 1)
    span_days = (datetime.date(1995, 12, 31) - earliest).days
    chosen = earliest + datetime.timedelta(random.randint(0, span_days))

    field_names = (year_field, month_field, day_field)
    components = (chosen.year, chosen.month, chosen.day)
    return dict(zip(field_names, map(str, components)))
4995
4996
def find_available_port(interface=''):
    """Ask the OS for a free port on `interface`

    A temporary socket is bound to port 0 and closed again.
    Returns the port number that was assigned, or None on any OSError.
    """
    try:
        with socket.socket() as probe:
            probe.bind((interface, 0))
            port = probe.getsockname()[1]
    except OSError:
        port = None
    return port
5004
5005
# Templates for internet shortcut files, which are plain text files.
# Each template is a %-format string with named placeholders: `url` in all
# of them, plus `filename` in the .desktop template.

# INI-style shortcut with a single `[InternetShortcut]` section
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# Apple property list (XML) holding the URL under the `URL` key
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# `[Desktop Entry]` of type Link; `filename` is used as the entry's name
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps a link type to the template used for it
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5037
5038
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    An explicit port 80 is omitted from the result only for schemes whose default port is 80 (`http` and `ws`); for any other scheme it is kept, since dropping it would change the port being addressed.
    IRIs without a host (e.g. relative references) are supported; their path, query and fragment are escaped as usual.

    @param iri          the IRI as str
    @returns            the equivalent ASCII-only URI as str
    @raises ValueError  for IPv6 hosts, which are not supported
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
    # `hostname` is None when the IRI has no host part
    net_location += (iri_parts.hostname or '').encode('idna').decode()

    port = iri_parts.port
    is_implied_port = port == 80 and iri_parts.scheme in ('http', 'ws')
    if port is not None and not is_implied_port:
        net_location += ':' + str(port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5081
5082
def to_high_limit_path(path):
    """
    Return a form of `path` that is not subject to the MAX_PATH limit on Windows

    On "win32" and "cygwin" the path is made absolute and given the
    extended-length prefix; on every other platform it is returned unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path

    # The maximum allowed length for the individual path segments may still be quite limited.
    absolute_path = os.path.abspath(path)
    return '\\\\?\\' + absolute_path
5089
5090
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """
    Look up a field with traverse_obj and render it through a %-template

    @param obj       Object handed to traversal.traverse_obj
    @param field     Lookup path(s); passed through variadic() and expanded as positional paths
    @param template  %-format string applied to func(value)
    @param ignore    Value(s) for which `default` is returned.
                     When not given, any falsy value is ignored
    @param default   Returned instead of the formatted string for ignored values
    @param func      Applied to the value before formatting
    """
    value = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not value
    else:
        skip = value in variadic(ignore)
    if skip:
        return default
    return template % func(value)
5096
5097
def clean_podcast_url(url):
    """Remove every known podcast tracking/analytics redirect prefix found in `url`"""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    # Each match (including its trailing slash) is simply dropped from the URL
    return re.sub(tracking_prefix_re, '', url)
5113
5114
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """
    Generate a random version 4 UUID string (lowercase, hyphenated)

    In the template, every "x" becomes a random hex digit, while "y" marks the
    position holding the UUID variant bits. For an RFC 4122 UUID those bits
    must be "10", so "y" may only be one of 8, 9, a or b.
    """
    def _random_digit(mobj):
        # 'x' => any hex digit; 'y' => variant nibble (binary 10xx)
        return random.choice(_HEX_TABLE if mobj.group() == 'x' else '89ab')

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5120
5121
def make_dir(path, to_screen=None):
    """
    Ensure that the directory which will contain `path` exists

    @param path       Path of a file; only its directory part is created
    @param to_screen  Optional callable that receives a message if creation fails
    @returns          True if the directory exists/was created (or `path` has no
                      directory part), False if creating it raised an OSError
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # callable() returns a bool, so it must be tested directly;
        # comparing it against None is always true and would call a missing reporter
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
5132
5133
def get_executable_path():
    """
    Return the absolute directory of the path reported by the updater

    The path is the second item of the result of
    `update._get_variant_and_executable_path()`. The import is done lazily
    inside the function.
    """
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
5138
5139
def get_user_config_dirs(package_name):
    """
    Yield the candidate per-user configuration directories for `package_name`

    Order: $XDG_CONFIG_HOME (or ~/.config), then %APPDATA% when that
    environment variable is set, then a dot-directory in the home directory.
    The directories are not checked for existence.
    """
    # .config (e.g. ~/.config/package_name)
    config_home = os.getenv('XDG_CONFIG_HOME')
    if not config_home:
        config_home = compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # home (~/.package_name)
    home = compat_expanduser('~')
    yield os.path.join(home, f'.{package_name}')
5152
5153
def get_system_config_dirs(package_name):
    """Yield the system-wide configuration directories for `package_name` (currently only /etc/package_name)"""
    system_dirs = (
        os.path.join('/etc', package_name),  # /etc/package_name
    )
    yield from system_dirs
5157
5158
def time_seconds(**kwargs):
    """
    Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by an offset

    @param kwargs  Passed to datetime.timedelta to build the offset (e.g. hours=9)
    """
    now = time.time()
    offset = datetime.timedelta(**kwargs)
    return now + offset.total_seconds()
5164
5165
# Create a JSON Web Signature (JWS) with the HS256 algorithm.
# The result is laid out like the JWS Compact Serialization (header.payload.signature).
# Implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# Implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """
    Build a HS256-signed token from `payload_data`

    @param payload_data  JSON-serializable payload
    @param key           Secret (str) used as the HMAC-SHA256 key
    @param headers       Extra header fields; they override the default "alg"/"typ"
    @returns             The token as bytes

    NOTE: the segments are encoded with standard base64 (base64.b64encode),
    i.e. padding is kept and the URL-safe alphabet is not used.
    """
    header_data = dict(alg='HS256', typ='JWT')
    if headers:
        header_data.update(headers)

    def _encode_segment(obj):
        return base64.b64encode(json.dumps(obj).encode())

    header_b64 = _encode_segment(header_data)
    payload_b64 = _encode_segment(payload_data)
    signing_input = header_b64 + b'.' + payload_b64
    mac = hmac.new(key.encode(), signing_input, hashlib.sha256)
    return signing_input + b'.' + base64.b64encode(mac.digest())
5183
5184
# Can be extended in future to verify the signature, parse the header
# and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """
    Return the decoded JSON payload of a three-segment token

    The header and the signature are ignored; the signature is NOT verified.
    Raises ValueError if `jwt` does not consist of exactly three dot-separated parts.
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # Add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded_payload = f'{payload_b64}==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
5191
5192
# None on non-Windows systems; False on Windows until windows_enable_vt_mode() succeeds
WINDOWS_VT_MODE = None if compat_os_name != 'nt' else False


@functools.cache
def supports_terminal_sequences(stream):
    """
    Whether terminal escape sequences can be written to `stream`

    On Windows this requires WINDOWS_VT_MODE to be truthy, elsewhere a
    non-empty TERM environment variable; in both cases `stream.isatty()`
    decides the final answer. The result is cached per stream.
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        # Any failure to query the stream is treated as "not a terminal"
        return False
5207
5208
def windows_enable_vt_mode():
    """
    Enable virtual terminal processing for the Windows console

    Does nothing when get_windows_version() is older than (10, 0, 10586).
    On success WINDOWS_VT_MODE is set to True and the cache of
    supports_terminal_sequences is cleared. Raises Exception if the console
    mode cannot be read or changed.

    Ref: https://bugs.python.org/issue30075
    """
    global WINDOWS_VT_MODE

    if get_windows_version() < (10, 0, 10586):
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    kernel32 = ctypes.WinDLL('kernel32', use_last_error=False)
    console_fd = os.open('CONOUT$', os.O_RDWR)
    try:
        console_handle = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(console_fd))
        current_mode = ctypes.wintypes.DWORD()
        if not kernel32.GetConsoleMode(console_handle, ctypes.byref(current_mode)):
            raise Exception('GetConsoleMode failed')

        # Keep every flag that is already set and add VT processing on top
        wanted_mode = ctypes.wintypes.DWORD(
            current_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING)
        if not kernel32.SetConsoleMode(console_handle, wanted_mode):
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(console_fd)

    WINDOWS_VT_MODE = True
    supports_terminal_sequences.cache_clear()
5239
5240
# ESC "[" followed by one or more non-"m" characters and a closing "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with everything matching _terminal_sequences_re removed"""
    return re.sub(_terminal_sequences_re, '', string)
5246
5247
def number_of_digits(number):
    """
    Return the length of `number` when rendered with "%d"

    Note that a minus sign is counted as well, and that non-integers are
    truncated by the "%d" conversion.
    """
    rendered = '%d' % number
    return len(rendered)
5250
5251
def join_nonempty(*values, delim='-', from_dict=None):
    """
    Join the string form of all truthy `values` using `delim`

    @param from_dict  If given, each value is instead used as a lookup path
                      (wrapped by variadic) into this object via traverse_obj
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(path)) for path in values)
    return delim.join(str(value) for value in values if value)
5256
5257
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Replace whatever `url_width_re` matches with that width
    * Fill in the dimensions (merged with merge_dicts, followed by the thumbnail's own fields)

    `thumbnails` is returned untouched when no format has a (non-zero) width.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    # Tuples compare element-wise, so the widest format wins and height only breaks ties
    largest = max(
        (tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats),
        default=(0, 0))
    max_width = largest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': scaled_url}, dict(zip(dimension_keys, largest)), thumbnail))
    return scaled
5278
5279
def parse_http_range(range):
    """
    Parse value of "Range" or "Content-Range" HTTP header into tuple.

    @returns (start, end, total); `end` and `total` are None when absent.
             (None, None, None) if the value is empty or has no "bytes" range
    """
    nothing = (None, None, None)
    if not range:
        return nothing
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if mobj is None:
        return nothing
    start, end, total = mobj.groups()
    return int(start), int_or_none(end), int_or_none(total)
5288
5289
def read_stdin(what):
    """
    Announce (through write_string) that `what` is about to be read from STDIN,
    mentioning the key combination that signals EOF, and return sys.stdin

    Nothing is actually read here; that is left to the caller.
    """
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    else:
        eof = 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5294
5295
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data  The leading bytes of the file
    @returns (encoding, bytes to skip); encoding is None if nothing was detected
    """

    # BOM marks are given priority over declarations
    for bom, encoding in BOMS:
        if data.startswith(bom):
            return encoding, len(bom)

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    # re.match only tries the very start of the data,
    # so the "# coding: ..." declaration has to be on the first line
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if declaration is None:
        return None, 0
    return declaration.group(1).decode(), 0
5312
5313
class Config:
    """
    A set of command-line style arguments plus the nested configurations
    that these arguments pull in through their "config_locations" option

    All nested Config objects share one set of already loaded locations,
    so that the same location is never loaded twice.
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser = parser
        self.label = label
        self._loaded_paths = set()
        self.configs = []

    def init(self, args=None, filename=None):
        """Set the arguments (and the file they came from) and load nested configs. May only be called once"""
        assert not self.__initialized
        self.own_args = args
        self.filename = filename
        return self.load_configs()

    def load_configs(self):
        """
        Parse own_args and load every config location they refer to

        @returns  False if this config's file was already loaded, True otherwise
        """
        base_dir = ''
        if self.filename:
            real_path = os.path.realpath(self.filename)
            base_dir = os.path.dirname(real_path)
            if real_path in self._loaded_paths:
                return False
            self._loaded_paths.add(real_path)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location != '-':
                # Relative locations are resolved against the directory of the current config file
                location = os.path.join(base_dir, expand_path(location))
                if os.path.isdir(location):
                    location = os.path.join(location, 'yt-dlp.conf')
                if not os.path.exists(location):
                    self.parser.error(f'config location {location} does not exist')
                self.append_config(self.read_file(location), location)
            elif location not in self._loaded_paths:
                # "-" stands for STDIN, which is read at most once
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        own_line = None
        if self.own_args is not None:
            own_line = f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}'
        # Nested configs are shown indented with a "| " prefix on each line
        nested_lines = (f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs)
        return join_nonempty(own_line, *nested_lines, delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """
        Read `filename` and split its contents into arguments (shell-like syntax, comments allowed)

        @returns  The list of arguments, or `default` if the file cannot be opened
        Raises ValueError if the contents cannot be decoded or split
        """
        try:
            stream = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            encoding, skip = determine_file_encoding(stream.read(512))
            stream.seek(skip, io.SEEK_SET)
        except OSError:
            encoding = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            text = stream.read().decode(encoding or preferredencoding())
            return shlex.split(text, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            stream.close()

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` where the values of credential options are replaced with "PRIVATE" """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handles the "--option=value" form
            m = eqre.match(o)
            return m.group('key') + '=PRIVATE' if m else o

        scrubbed = [_scrub_eq(o) for o in opts]
        # Handles the "--option value" form: hide whatever follows a private option
        for idx, opt in enumerate(scrubbed):
            if idx + 1 < len(scrubbed) and opt in PRIVATE_OPTS:
                scrubbed[idx + 1] = 'PRIVATE'
        return scrubbed

    def append_config(self, *args, label=None):
        """Create a nested config from `args` (see init) and keep it if it got loaded"""
        nested = type(self)(self.parser, label)
        nested._loaded_paths = self._loaded_paths
        if nested.init(*args):
            self.configs.append(nested)

    @property
    def all_args(self):
        """All arguments: those of the nested configs (last added first), followed by the own ones"""
        for nested in reversed(self.configs):
            yield from nested.all_args
        if self.parsed_args:
            yield from self.parsed_args

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
5421
5422
class WebSocketsWrapper:
    """
    Wraps websockets module to use in non-async scopes

    Every instance owns a private event loop on which all coroutines of the
    connection are run to completion. The instance can be used as a context
    manager; __exit__ is additionally registered with atexit.
    """
    pool = None

    def __init__(self, url, headers=None, connect=True):
        """
        @param url      URL handed to websockets.connect
        @param headers  Passed to websockets.connect as `extra_headers`
        @param connect  Whether to open the connection immediately
        """
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        # This runs both when leaving a `with` block and at interpreter exit (atexit).
        # Once the loop is closed, there is nothing left to clean up
        if self.loop.is_closed():
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Pending tasks can only be cancelled while the loop is still open,
            # so cancel first and close the loop afterwards (same order as asyncio.run)
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """
        Run the coroutine `main` on `loop` until it completes and return its result

        Raises ValueError if `main` is not a coroutine
        """
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel all unfinished tasks of `loop`, wait for them and report their unhandled exceptions"""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # The "loop" parameter of gather() was removed in Python 3.10.
        # It is not needed either: the tasks already belong to `loop`
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5492
5493
def merge_headers(*dicts):
    """
    Merge dicts of http headers case insensitively, prioritizing the latter ones

    The header names of the result are normalized with str.title()
    """
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
5497
5498
def cached_method(f):
    """
    Cache a method

    The results are stored on the instance itself (in its
    "_cached_method__cache" attribute), keyed by method name and by the
    values of all arguments except `self` after defaults are applied.
    All argument values must therefore be hashable.
    """
    method_signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        call = method_signature.bind(self, *args, **kwargs)
        call.apply_defaults()
        # Drop `self`; positional, keyword and defaulted calls then share one key
        key = tuple(call.arguments.values())[1:]

        instance_cache = vars(self).setdefault('_cached_method__cache', {})
        results = instance_cache.setdefault(f.__name__, {})
        if key in results:
            return results[key]
        results[key] = f(self, *args, **kwargs)
        return results[key]

    return wrapper
5514
5515
class classproperty:
    """
    property access for class methods with optional caching

    Usable both as `@classproperty` and as `@classproperty(cache=True)`.
    With caching, the value is computed once per class it is accessed on.
    """
    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Called with options only: return a decorator that is still awaiting the function
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls in cache:
            return cache[cls]
        cache[cls] = self.func(cls)
        return cache[cls]
5534
5535
class function_with_repr:
    """
    Callable wrapper around `func` with a customizable repr()

    The repr is `repr_` if given (and truthy), else "<module>.<qualname>" of the function.
    """
    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        custom = self.__repr
        if not custom:
            return f'{self.func.__module__}.{self.func.__qualname__}'
        return custom
5548
5549
class Namespace(types.SimpleNamespace):
    """
    Namespace intended to be treated as immutable

    Iterating over it yields the attribute values (not the names).
    NOTE(review): nothing in this class actually prevents attributes from being reassigned
    """

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        """The (name, value) pairs of all attributes"""
        return vars(self).items()
5559
5560
# File extensions (without the leading dot) grouped by the kind of media they denote.
# `video` and `audio` initially hold only the less common extensions;
# the respective `common_*` tuples are appended to them right after the definition
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# From here on, `video`/`audio` contain all video/audio extensions, common ones last
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions (thumbnails, storyboards and subtitles are not included)
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5575
5576
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue

    An attempt counts as failed when `retry.error` was set during the iteration.
    After each failed attempt, the error callback is called with
    (error, attempt, retries) plus the keyword arguments given to the constructor.
    Iteration stops after the first attempt that did not set an error,
    or once the callback returns after more than `retries` retries.
    """
    attempt = 0
    _error = None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # NO_DEFAULT marks an attempt that finished without reporting an error
        if self._error is NO_DEFAULT:
            return False
        return self.attempt <= self.retries

    @property
    def error(self):
        return None if self._error is NO_DEFAULT else self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """
        Utility function for reporting retries

        @param e           The error of the failed attempt
        @param count       Number of the failed attempt
        @param retries     Maximum number of retries
        @param sleep_func  Delay before the next attempt: a number, or a callable
                           that is called with n=<number of previous retries>
        @param info        Callable for informational messages
        @param warn        Callable for warnings
        @param error       Callable used to report the final failure.
                           If not given, the error `e` is raised instead
        @param suffix      Extra text for the "Retrying" message
        """
        if count > retries:
            if not error:
                raise e
            message = f'{e}. Giving up after {count - 1} retries' if count > 1 else str(e)
            return error(message)

        if not count:
            return warn(e)
        if isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        if callable(sleep_func):
            delay = float_or_none(sleep_func(n=count - 1))
        else:
            delay = sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5630             time.sleep(delay)
5631
5632
def make_archive_id(ie, video_id):
    """
    Build the "<lowercased extractor key> <video id>" identifier

    @param ie  Either the extractor key as a string, or an object providing `ie_key()`
    """
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
5636
5637
def truncate_string(s, left, right=0):
    """
    Shorten `s` to `left + right` characters by cutting out its middle

    The result keeps the first `left - 3` characters, an ellipsis ("...") and
    the last `right` characters. `s` is returned as-is if it is None or
    already short enough.
    """
    assert left > 3 and right >= 0
    if s is None:
        return s
    if len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ""
    return f'{head}...{tail}'
5643
5644
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """
    Resolve a list of option values into the requested items

    @param options     Values to process in order. A leading "-" removes the item(s)
                       instead of adding them
    @param alias_dict  Maps alias names to lists of values. Its "all" entry is
                       required and lists every allowed item
    @param use_regex   Treat the (non-alias) values as case-insensitive regexes that
                       must fully match an item of the "all" list
    @param start       Items that are requested from the beginning
    @returns           orderedSet() of the requested items
    Raises ValueError for a value that is neither an alias nor (without regex) an allowed item
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for option in options:
        discard = option.startswith('-')
        name = option[1:] if discard else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if discard:
                # Discarding an alias inverts each of its members
                expansion = [
                    member[1:] if member.startswith('-') else f'-{member}'
                    for member in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            matched = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            matched = [name]
        else:
            raise ValueError(name)

        if not discard:
            requested.extend(matched)
            continue
        for item in matched:
            # Remove every occurrence, not just the first one
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5673
5674
5675 # TODO: Rewrite
5676 class FormatSorter:
5677     regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5678
5679     default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5680                'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5681                'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
5682     ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5683                     'height', 'width', 'proto', 'vext', 'abr', 'aext',
5684                     'fps', 'fs_approx', 'source', 'id')
5685
5686     settings = {
5687         'vcodec': {'type': 'ordered', 'regex': True,
5688                    'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5689         'acodec': {'type': 'ordered', 'regex': True,
5690                    'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
5691         'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5692                 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5693         'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5694                   'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5695         'vext': {'type': 'ordered', 'field': 'video_ext',
5696                  'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5697                  'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
5698         'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5699                  'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5700                  'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
5701         'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5702         'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5703                        'field': ('vcodec', 'acodec'),
5704                        'function': lambda it: int(any(v != 'none' for v in it))},
5705         'ie_pref': {'priority': True, 'type': 'extractor'},
5706         'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5707         'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5708         'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5709         'quality': {'convert': 'float', 'default': -1},
5710         'filesize': {'convert': 'bytes'},
5711         'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5712         'id': {'convert': 'string', 'field': 'format_id'},
5713         'height': {'convert': 'float_none'},
5714         'width': {'convert': 'float_none'},
5715         'fps': {'convert': 'float_none'},
5716         'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5717         'tbr': {'convert': 'float_none'},
5718         'vbr': {'convert': 'float_none'},
5719         'abr': {'convert': 'float_none'},
5720         'asr': {'convert': 'float_none'},
5721         'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5722
5723         'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5724         'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'),
5725                'function': lambda it: next(filter(None, it), None)},
5726         'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'),
5727                  'function': lambda it: next(filter(None, it), None)},
5728         'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5729         'res': {'type': 'multiple', 'field': ('height', 'width'),
5730                 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5731
5732         # Actual field names
5733         'format_id': {'type': 'alias', 'field': 'id'},
5734         'preference': {'type': 'alias', 'field': 'ie_pref'},
5735         'language_preference': {'type': 'alias', 'field': 'lang'},
5736         'source_preference': {'type': 'alias', 'field': 'source'},
5737         'protocol': {'type': 'alias', 'field': 'proto'},
5738         'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5739         'audio_channels': {'type': 'alias', 'field': 'channels'},
5740
5741         # Deprecated
5742         'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5743         'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5744         'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5745         'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5746         'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5747         'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5748         'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5749         'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5750         'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5751         'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5752         'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5753         'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5754         'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5755         'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5756         'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5757         'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5758         'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5759         'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5760         'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5761         'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5762     }
5763
5764     def __init__(self, ydl, field_preference):
5765         self.ydl = ydl
5766         self._order = []
5767         self.evaluate_params(self.ydl.params, field_preference)
5768         if ydl.params.get('verbose'):
5769             self.print_verbose_info(self.ydl.write_debug)
5770
5771     def _get_field_setting(self, field, key):
5772         if field not in self.settings:
5773             if key in ('forced', 'priority'):
5774                 return False
5775             self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5776                                         'deprecated and may be removed in a future version')
5777             self.settings[field] = {}
5778         propObj = self.settings[field]
5779         if key not in propObj:
5780             type = propObj.get('type')
5781             if key == 'field':
5782                 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5783             elif key == 'convert':
5784                 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5785             else:
5786                 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5787             propObj[key] = default
5788         return propObj[key]
5789
5790     def _resolve_field_value(self, field, value, convertNone=False):
5791         if value is None:
5792             if not convertNone:
5793                 return None
5794         else:
5795             value = value.lower()
5796         conversion = self._get_field_setting(field, 'convert')
5797         if conversion == 'ignore':
5798             return None
5799         if conversion == 'string':
5800             return value
5801         elif conversion == 'float_none':
5802             return float_or_none(value)
5803         elif conversion == 'bytes':
5804             return parse_bytes(value)
5805         elif conversion == 'order':
5806             order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5807             use_regex = self._get_field_setting(field, 'regex')
5808             list_length = len(order_list)
5809             empty_pos = order_list.index('') if '' in order_list else list_length + 1
5810             if use_regex and value is not None:
5811                 for i, regex in enumerate(order_list):
5812                     if regex and re.match(regex, value):
5813                         return list_length - i
5814                 return list_length - empty_pos  # not in list
5815             else:  # not regex or  value = None
5816                 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5817         else:
5818             if value.isnumeric():
5819                 return float(value)
5820             else:
5821                 self.settings[field]['convert'] = 'string'
5822                 return value
5823
5824     def evaluate_params(self, params, sort_extractor):
5825         self._use_free_order = params.get('prefer_free_formats', False)
5826         self._sort_user = params.get('format_sort', [])
5827         self._sort_extractor = sort_extractor
5828
5829         def add_item(field, reverse, closest, limit_text):
5830             field = field.lower()
5831             if field in self._order:
5832                 return
5833             self._order.append(field)
5834             limit = self._resolve_field_value(field, limit_text)
5835             data = {
5836                 'reverse': reverse,
5837                 'closest': False if limit is None else closest,
5838                 'limit_text': limit_text,
5839                 'limit': limit}
5840             if field in self.settings:
5841                 self.settings[field].update(data)
5842             else:
5843                 self.settings[field] = data
5844
5845         sort_list = (
5846             tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5847             + (tuple() if params.get('format_sort_force', False)
5848                 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5849             + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5850
5851         for item in sort_list:
5852             match = re.match(self.regex, item)
5853             if match is None:
5854                 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5855             field = match.group('field')
5856             if field is None:
5857                 continue
5858             if self._get_field_setting(field, 'type') == 'alias':
5859                 alias, field = field, self._get_field_setting(field, 'field')
5860                 if self._get_field_setting(alias, 'deprecated'):
5861                     self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5862                                                 f'be removed in a future version. Please use {field} instead')
5863             reverse = match.group('reverse') is not None
5864             closest = match.group('separator') == '~'
5865             limit_text = match.group('limit')
5866
5867             has_limit = limit_text is not None
5868             has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5869             has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5870
5871             fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5872             limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5873             limit_count = len(limits)
5874             for (i, f) in enumerate(fields):
5875                 add_item(f, reverse, closest,
5876                          limits[i] if i < limit_count
5877                          else limits[0] if has_limit and not has_multiple_limits
5878                          else None)
5879
5880     def print_verbose_info(self, write_debug):
5881         if self._sort_user:
5882             write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5883         if self._sort_extractor:
5884             write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5885         write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5886             '+' if self._get_field_setting(field, 'reverse') else '', field,
5887             '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5888                           self._get_field_setting(field, 'limit_text'),
5889                           self._get_field_setting(field, 'limit'))
5890             if self._get_field_setting(field, 'limit_text') is not None else '')
5891             for field in self._order if self._get_field_setting(field, 'visible')]))
5892
5893     def _calculate_field_preference_from_value(self, format, field, type, value):
5894         reverse = self._get_field_setting(field, 'reverse')
5895         closest = self._get_field_setting(field, 'closest')
5896         limit = self._get_field_setting(field, 'limit')
5897
5898         if type == 'extractor':
5899             maximum = self._get_field_setting(field, 'max')
5900             if value is None or (maximum is not None and value >= maximum):
5901                 value = -1
5902         elif type == 'boolean':
5903             in_list = self._get_field_setting(field, 'in_list')
5904             not_in_list = self._get_field_setting(field, 'not_in_list')
5905             value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5906         elif type == 'ordered':
5907             value = self._resolve_field_value(field, value, True)
5908
5909         # try to convert to number
5910         val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5911         is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5912         if is_num:
5913             value = val_num
5914
5915         return ((-10, 0) if value is None
5916                 else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
5917                 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5918                 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5919                 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5920                 else (-1, value, 0))
5921
5922     def _calculate_field_preference(self, format, field):
5923         type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
5924         get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5925         if type == 'multiple':
5926             type = 'field'  # Only 'field' is allowed in multiple for now
5927             actual_fields = self._get_field_setting(field, 'field')
5928
5929             value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5930         else:
5931             value = get_value(field)
5932         return self._calculate_field_preference_from_value(format, field, type, value)
5933
5934     def calculate_preference(self, format):
5935         # Determine missing protocol
5936         if not format.get('protocol'):
5937             format['protocol'] = determine_protocol(format)
5938
5939         # Determine missing ext
5940         if not format.get('ext') and 'url' in format:
5941             format['ext'] = determine_ext(format['url'])
5942         if format.get('vcodec') == 'none':
5943             format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5944             format['video_ext'] = 'none'
5945         else:
5946             format['video_ext'] = format['ext']
5947             format['audio_ext'] = 'none'
5948         # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
5949         #    format['preference'] = -1000
5950
5951         if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5952             # HEVC-over-FLV is out-of-spec by FLV's original spec
5953             # ref. https://trac.ffmpeg.org/ticket/6389
5954             # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5955             format['preference'] = -100
5956
5957         # Determine missing bitrates
5958         if format.get('vcodec') == 'none':
5959             format['vbr'] = 0
5960         if format.get('acodec') == 'none':
5961             format['abr'] = 0
5962         if not format.get('vbr') and format.get('vcodec') != 'none':
5963             format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
5964         if not format.get('abr') and format.get('acodec') != 'none':
5965             format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
5966         if not format.get('tbr'):
5967             format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
5968
5969         return tuple(self._calculate_field_preference(format, field) for field in self._order)