yt_dlp/utils/_utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import collections.abc
   9 import contextlib
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import gzip
  15 import hashlib
  16 import hmac
  17 import html.entities
  18 import html.parser
  19 import http.client
  20 import http.cookiejar
  21 import inspect
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import mimetypes
  28 import operator
  29 import os
  30 import platform
  31 import random
  32 import re
  33 import shlex
  34 import socket
  35 import ssl
  36 import struct
  37 import subprocess
  38 import sys
  39 import tempfile
  40 import time
  41 import traceback
  42 import types
  43 import unicodedata
  44 import urllib.error
  45 import urllib.parse
  46 import urllib.request
  47 import xml.etree.ElementTree
  48 import zlib
  49
  50 from . import traversal
  51
  52 from ..compat import functools  # isort: split
  53 from ..compat import (
  54     compat_etree_fromstring,
  55     compat_expanduser,
  56     compat_HTMLParseError,
  57     compat_os_name,
  58     compat_shlex_quote,
  59 )
  60 from ..dependencies import brotli, certifi, websockets, xattr
  61 from ..socks import ProxyType, sockssocket
  62
# Drop the last dotted component of this module's name so that the module
# identifies itself by the name of its parent package.
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
# (the concrete type of the object returned by re.compile)
compiled_regex_type = type(re.compile(''))
  67
  68
  69 def random_user_agent():
  70     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  71     _CHROME_VERSIONS = (
  72         '90.0.4430.212',
  73         '90.0.4430.24',
  74         '90.0.4430.70',
  75         '90.0.4430.72',
  76         '90.0.4430.85',
  77         '90.0.4430.93',
  78         '91.0.4472.101',
  79         '91.0.4472.106',
  80         '91.0.4472.114',
  81         '91.0.4472.124',
  82         '91.0.4472.164',
  83         '91.0.4472.19',
  84         '91.0.4472.77',
  85         '92.0.4515.107',
  86         '92.0.4515.115',
  87         '92.0.4515.131',
  88         '92.0.4515.159',
  89         '92.0.4515.43',
  90         '93.0.4556.0',
  91         '93.0.4577.15',
  92         '93.0.4577.63',
  93         '93.0.4577.82',
  94         '94.0.4606.41',
  95         '94.0.4606.54',
  96         '94.0.4606.61',
  97         '94.0.4606.71',
  98         '94.0.4606.81',
  99         '94.0.4606.85',
 100         '95.0.4638.17',
 101         '95.0.4638.50',
 102         '95.0.4638.54',
 103         '95.0.4638.69',
 104         '95.0.4638.74',
 105         '96.0.4664.18',
 106         '96.0.4664.45',
 107         '96.0.4664.55',
 108         '96.0.4664.93',
 109         '97.0.4692.20',
 110     )
 111     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 112
 113
# Content-encoding names always listed as supported
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
# 'br' is listed only when the optional brotli dependency is available (truthy)
if brotli:
    SUPPORTED_ENCODINGS.append('br')
 119
# Default HTTP header set. The User-Agent is picked once, when this module is
# imported, so it stays the same for the lifetime of the process.
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
 126
 127
# Fixed User-Agent strings keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
 131
 132
 133 class NO_DEFAULT:
 134     pass
 135
 136
 137 def IDENTITY(x):
 138     return x
 139
 140
# Full English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code; every list is ordered January..December
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
 156
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Maps a timezone abbreviation to its offset from UTC in whole hours
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
 166
# needed for sanitizing filenames in restricted mode
# Maps each accented character to an ASCII replacement. The two arguments of
# zip() are paired positionally; multi-letter replacements ('AE', 'OE', 'TH',
# 'ss', ...) are passed as one-item lists so that chain() yields them whole.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 171
 172 DATE_FORMATS = (
 173     '%d %B %Y',
 174     '%d %b %Y',
 175     '%B %d %Y',
 176     '%B %dst %Y',
 177     '%B %dnd %Y',
 178     '%B %drd %Y',
 179     '%B %dth %Y',
 180     '%b %d %Y',
 181     '%b %dst %Y',
 182     '%b %dnd %Y',
 183     '%b %drd %Y',
 184     '%b %dth %Y',
 185     '%b %dst %Y %I:%M',
 186     '%b %dnd %Y %I:%M',
 187     '%b %drd %Y %I:%M',
 188     '%b %dth %Y %I:%M',
 189     '%Y %m %d',
 190     '%Y-%m-%d',
 191     '%Y.%m.%d.',
 192     '%Y/%m/%d',
 193     '%Y/%m/%d %H:%M',
 194     '%Y/%m/%d %H:%M:%S',
 195     '%Y%m%d%H%M',
 196     '%Y%m%d%H%M%S',
 197     '%Y%m%d',
 198     '%Y-%m-%d %H:%M',
 199     '%Y-%m-%d %H:%M:%S',
 200     '%Y-%m-%d %H:%M:%S.%f',
 201     '%Y-%m-%d %H:%M:%S:%f',
 202     '%d.%m.%Y %H:%M',
 203     '%d.%m.%Y %H.%M',
 204     '%Y-%m-%dT%H:%M:%SZ',
 205     '%Y-%m-%dT%H:%M:%S.%fZ',
 206     '%Y-%m-%dT%H:%M:%S.%f0Z',
 207     '%Y-%m-%dT%H:%M:%S',
 208     '%Y-%m-%dT%H:%M:%S.%f',
 209     '%Y-%m-%dT%H:%M',
 210     '%b %d %Y at %H:%M',
 211     '%b %d %Y at %H:%M:%S',
 212     '%B %d %Y at %H:%M',
 213     '%B %d %Y at %H:%M:%S',
 214     '%H:%M %d-%b-%Y',
 215 )
 216
 217 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
 218 DATE_FORMATS_DAY_FIRST.extend([
 219     '%d-%m-%Y',
 220     '%d.%m.%Y',
 221     '%d.%m.%y',
 222     '%d/%m/%Y',
 223     '%d/%m/%y',
 224     '%d/%m/%Y %H:%M:%S',
 225     '%d-%m-%Y %H:%M',
 226 ])
 227
 228 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
 229 DATE_FORMATS_MONTH_FIRST.extend([
 230     '%m-%d-%Y',
 231     '%m.%d.%Y',
 232     '%m/%d/%Y',
 233     '%m/%d/%y',
 234     '%m/%d/%Y %H:%M:%S',
 235 ])
 236
# Captures four groups from text of the form `}('...',N,M,'a|b|c'.split('|')`
# NOTE(review): presumably the argument list of "packed" JavaScript - confirm with callers
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element (case-insensitive, dot
# matches newlines) and captures its object/array payload as group `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Unsigned integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
 241
 242
 243 @functools.cache
 244 def preferredencoding():
 245     """Get preferred encoding.
 246
 247     Returns the best encoding scheme for the system, based on
 248     locale.getpreferredencoding() and some further tweaks.
 249     """
 250     try:
 251         pref = locale.getpreferredencoding()
 252         'TEST'.encode(pref)
 253     except Exception:
 254         pref = 'UTF-8'
 255
 256     return pref
 257
 258
 259 def write_json_file(obj, fn):
 260     """ Encode obj as JSON and write it to fn, atomically if possible """
 261
 262     tf = tempfile.NamedTemporaryFile(
 263         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 264         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 265
 266     try:
 267         with tf:
 268             json.dump(obj, tf, ensure_ascii=False)
 269         if sys.platform == 'win32':
 270             # Need to remove existing file on Windows, else os.rename raises
 271             # WindowsError or FileExistsError.
 272             with contextlib.suppress(OSError):
 273                 os.unlink(fn)
 274         with contextlib.suppress(OSError):
 275             mask = os.umask(0)
 276             os.umask(mask)
 277             os.chmod(tf.name, 0o666 & ~mask)
 278         os.rename(tf.name, fn)
 279     except Exception:
 280         with contextlib.suppress(OSError):
 281             os.remove(tf.name)
 282         raise
 283
 284
 285 def find_xpath_attr(node, xpath, key, val=None):
 286     """ Find the xpath xpath[@key=val] """
 287     assert re.match(r'^[a-zA-Z_-]+$', key)
 288     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 289     return node.find(expr)
 290
 291 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 292 # the namespace parameter
 293
 294
 295 def xpath_with_ns(path, ns_map):
 296     components = [c.split(':') for c in path.split('/')]
 297     replaced = []
 298     for c in components:
 299         if len(c) == 1:
 300             replaced.append(c[0])
 301         else:
 302             ns, tag = c
 303             replaced.append('{%s}%s' % (ns_map[ns], tag))
 304     return '/'.join(replaced)
 305
 306
 307 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 308     def _find_xpath(xpath):
 309         return node.find(xpath)
 310
 311     if isinstance(xpath, str):
 312         n = _find_xpath(xpath)
 313     else:
 314         for xp in xpath:
 315             n = _find_xpath(xp)
 316             if n is not None:
 317                 break
 318
 319     if n is None:
 320         if default is not NO_DEFAULT:
 321             return default
 322         elif fatal:
 323             name = xpath if name is None else name
 324             raise ExtractorError('Could not find XML element %s' % name)
 325         else:
 326             return None
 327     return n
 328
 329
 330 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 331     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 332     if n is None or n == default:
 333         return n
 334     if n.text is None:
 335         if default is not NO_DEFAULT:
 336             return default
 337         elif fatal:
 338             name = xpath if name is None else name
 339             raise ExtractorError('Could not find XML element\'s text %s' % name)
 340         else:
 341             return None
 342     return n.text
 343
 344
 345 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 346     n = find_xpath_attr(node, xpath, key)
 347     if n is None:
 348         if default is not NO_DEFAULT:
 349             return default
 350         elif fatal:
 351             name = f'{xpath}[@{key}]' if name is None else name
 352             raise ExtractorError('Could not find XML attribute %s' % name)
 353         else:
 354             return None
 355     return n.attrib[key]
 356
 357
 358 def get_element_by_id(id, html, **kwargs):
 359     """Return the content of the tag with the specified ID in the passed HTML document"""
 360     return get_element_by_attribute('id', id, html, **kwargs)
 361
 362
 363 def get_element_html_by_id(id, html, **kwargs):
 364     """Return the html of the tag with the specified ID in the passed HTML document"""
 365     return get_element_html_by_attribute('id', id, html, **kwargs)
 366
 367
 368 def get_element_by_class(class_name, html):
 369     """Return the content of the first tag with the specified class in the passed HTML document"""
 370     retval = get_elements_by_class(class_name, html)
 371     return retval[0] if retval else None
 372
 373
 374 def get_element_html_by_class(class_name, html):
 375     """Return the html of the first tag with the specified class in the passed HTML document"""
 376     retval = get_elements_html_by_class(class_name, html)
 377     return retval[0] if retval else None
 378
 379
 380 def get_element_by_attribute(attribute, value, html, **kwargs):
 381     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 382     return retval[0] if retval else None
 383
 384
 385 def get_element_html_by_attribute(attribute, value, html, **kargs):
 386     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 387     return retval[0] if retval else None
 388
 389
 390 def get_elements_by_class(class_name, html, **kargs):
 391     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 392     return get_elements_by_attribute(
 393         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 394         html, escape_value=False)
 395
 396
 397 def get_elements_html_by_class(class_name, html):
 398     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 399     return get_elements_html_by_attribute(
 400         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 401         html, escape_value=False)
 402
 403
 404 def get_elements_by_attribute(*args, **kwargs):
 405     """Return the content of the tag with the specified attribute in the passed HTML document"""
 406     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 407
 408
 409 def get_elements_html_by_attribute(*args, **kwargs):
 410     """Return the html of the tag with the specified attribute in the passed HTML document"""
 411     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 412
 413
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator yielding one (content, whole) tuple per matching tag.
    @param attribute     Name of the attribute to look for
    @param value         Value the attribute must have; a falsy value yields nothing
    @param html          The HTML document to search
    @param tag           Regular expression the tag name has to match
    @param escape_value  If true, value is matched literally; otherwise it is
                         inserted into the pattern as a regular expression
    """
    if not value:
        return

    # Quotes around the attribute value are optional ('?') unless the value
    # starts with whitespace, a quote, a backtick, '=', '<' or '>'
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches from the opening '<' up to the end of the wanted attribute; any
    # attributes preceding it are skipped, honouring quoted values
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Extract the complete element, starting at the matched opening tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Drop one pair of quotes enclosing the whole content, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 439
 440
 441 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 442     """
 443     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 444     closing tag for the first opening tag it has encountered, and can be used
 445     as a context manager
 446     """
 447
 448     class HTMLBreakOnClosingTagException(Exception):
 449         pass
 450
 451     def __init__(self):
 452         self.tagstack = collections.deque()
 453         html.parser.HTMLParser.__init__(self)
 454
 455     def __enter__(self):
 456         return self
 457
 458     def __exit__(self, *_):
 459         self.close()
 460
 461     def close(self):
 462         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 463         # so data remains buffered; we no longer have any interest in it, thus
 464         # override this method to discard it
 465         pass
 466
 467     def handle_starttag(self, tag, _):
 468         self.tagstack.append(tag)
 469
 470     def handle_endtag(self, tag):
 471         if not self.tagstack:
 472             raise compat_HTMLParseError('no tags in the stack')
 473         while self.tagstack:
 474             inner_tag = self.tagstack.pop()
 475             if inner_tag == tag:
 476                 break
 477         else:
 478             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 479         if not self.tagstack:
 480             raise self.HTMLBreakOnClosingTagException()
 481
 482
 483 # XXX: This should be far less strict
 484 def get_element_text_and_html_by_tag(tag, html):
 485     """
 486     For the first element with the specified tag in the passed HTML document
 487     return its' content (text) and the whole element (html)
 488     """
 489     def find_or_raise(haystack, needle, exc):
 490         try:
 491             return haystack.index(needle)
 492         except ValueError:
 493             raise exc
 494     closing_tag = f'</{tag}>'
 495     whole_start = find_or_raise(
 496         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 497     content_start = find_or_raise(
 498         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 499     content_start += whole_start + 1
 500     with HTMLBreakOnClosingTagParser() as parser:
 501         parser.feed(html[whole_start:content_start])
 502         if not parser.tagstack or parser.tagstack[0] != tag:
 503             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 504         offset = content_start
 505         while offset < len(html):
 506             next_closing_tag_start = find_or_raise(
 507                 html[offset:], closing_tag,
 508                 compat_HTMLParseError(f'closing {tag} tag not found'))
 509             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 510             try:
 511                 parser.feed(html[offset:offset + next_closing_tag_end])
 512                 offset += next_closing_tag_end
 513             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 514                 return html[content_start:offset + next_closing_tag_start], \
 515                     html[whole_start:offset + next_closing_tag_end]
 516         raise compat_HTMLParseError('unexpected end of html')
 517
 518
 519 class HTMLAttributeParser(html.parser.HTMLParser):
 520     """Trivial HTML parser to gather the attributes for a single element"""
 521
 522     def __init__(self):
 523         self.attrs = {}
 524         html.parser.HTMLParser.__init__(self)
 525
 526     def handle_starttag(self, tag, attrs):
 527         self.attrs = dict(attrs)
 528         raise compat_HTMLParseError('done')
 529
 530
 531 class HTMLListAttrsParser(html.parser.HTMLParser):
 532     """HTML parser to gather the attributes for the elements of a list"""
 533
 534     def __init__(self):
 535         html.parser.HTMLParser.__init__(self)
 536         self.items = []
 537         self._level = 0
 538
 539     def handle_starttag(self, tag, attrs):
 540         if tag == 'li' and self._level == 0:
 541             self.items.append(dict(attrs))
 542         self._level += 1
 543
 544     def handle_endtag(self, tag):
 545         self._level -= 1
 546
 547
 548 def extract_attributes(html_element):
 549     """Given a string for an HTML element such as
 550     <el
 551          a="foo" B="bar" c="&98;az" d=boz
 552          empty= noval entity="&amp;"
 553          sq='"' dq="'"
 554     >
 555     Decode and return a dictionary of attributes.
 556     {
 557         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 558         'empty': '', 'noval': None, 'entity': '&',
 559         'sq': '"', 'dq': '\''
 560     }.
 561     """
 562     parser = HTMLAttributeParser()
 563     with contextlib.suppress(compat_HTMLParseError):
 564         parser.feed(html_element)
 565         parser.close()
 566     return parser.attrs
 567
 568
 569 def parse_list(webpage):
 570     """Given a string for an series of HTML <li> elements,
 571     return a dictionary of their attributes"""
 572     parser = HTMLListAttrsParser()
 573     parser.feed(webpage)
 574     parser.close()
 575     return parser.items
 576
 577
 578 def clean_html(html):
 579     """Clean an HTML snippet into a readable string"""
 580
 581     if html is None:  # Convenience for sanitizing descriptions etc.
 582         return html
 583
 584     html = re.sub(r'\s+', ' ', html)
 585     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 586     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 587     # Strip html tags
 588     html = re.sub('<.*?>', '', html)
 589     # Replace html entities
 590     html = unescapeHTML(html)
 591     return html.strip()
 592
 593
 594 class LenientJSONDecoder(json.JSONDecoder):
 595     # TODO: Write tests
 596     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 597         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 598         self._close_attempts = 2 * close_objects
 599         super().__init__(*args, **kwargs)
 600
 601     @staticmethod
 602     def _close_object(err):
 603         doc = err.doc[:err.pos]
 604         # We need to add comma first to get the correct error message
 605         if err.msg.startswith('Expecting \',\''):
 606             return doc + ','
 607         elif not doc.endswith(','):
 608             return
 609
 610         if err.msg.startswith('Expecting property name'):
 611             return doc[:-1] + '}'
 612         elif err.msg.startswith('Expecting value'):
 613             return doc[:-1] + ']'
 614
 615     def decode(self, s):
 616         if self.transform_source:
 617             s = self.transform_source(s)
 618         for attempt in range(self._close_attempts + 1):
 619             try:
 620                 if self.ignore_extra:
 621                     return self.raw_decode(s.lstrip())[0]
 622                 return super().decode(s)
 623             except json.JSONDecodeError as e:
 624                 if e.pos is None:
 625                     raise
 626                 elif attempt < self._close_attempts:
 627                     s = self._close_object(e)
 628                     if s is not None:
 629                         continue
 630                 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
 631         assert False, 'Too many attempts to decode JSON'
 632
 633
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        # Prefer the underlying binary buffer when stdout has one
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # At most two attempts: the name as given, then the sanitized name
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Fall back to a plain, unlocked open()
                # NOTE(review): relies on LockingUnsupportedError being an OSError - confirm
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt and on permission errors
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Sanitizing changed nothing, so retrying cannot help
            if old_filename == filename:
                raise
 671
 672
 673 def timeconvert(timestr):
 674     """Convert RFC 2822 defined time string into system timestamp"""
 675     timestamp = None
 676     timetuple = email.utils.parsedate_tz(timestr)
 677     if timetuple is not None:
 678         timestamp = email.utils.mktime_tz(timetuple)
 679     return timestamp
 680
 681
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect

    Returns the sanitized string; '' for empty input, otherwise never empty.
    """
    if s == '':
        return ''

    # Maps a single character to its replacement. Replacement characters are
    # prefixed with NUL ('\0') so that they can be told apart from characters
    # of the original string during the clean-up further below
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # '?' and control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the NUL markers; fall back to '_' if nothing is left
    result = result.replace('\0', '') or '_'

    # Only runs when is_id was explicitly given as a false value: the
    # NO_DEFAULT sentinel is a class and therefore truthy
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 735
 736
 737 def sanitize_path(s, force=False):
 738     """Sanitizes and normalizes path on Windows"""
 739     if sys.platform == 'win32':
 740         force = False
 741         drive_or_unc, _ = os.path.splitdrive(s)
 742     elif force:
 743         drive_or_unc = ''
 744     else:
 745         return s
 746
 747     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 748     if drive_or_unc:
 749         norm_path.pop(0)
 750     sanitized_path = [
 751         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 752         for path_part in norm_path]
 753     if drive_or_unc:
 754         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 755     elif force and s and s[0] == os.path.sep:
 756         sanitized_path.insert(0, os.path.sep)
 757     return os.path.join(*sanitized_path)
 758
 759
 760 def sanitize_url(url, *, scheme='http'):
 761     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 762     # the number of unwanted failures due to missing protocol
 763     if url is None:
 764         return
 765     elif url.startswith('//'):
 766         return f'{scheme}:{url}'
 767     # Fix some common typos seen so far
 768     COMMON_TYPOS = (
 769         # https://github.com/ytdl-org/youtube-dl/issues/15649
 770         (r'^httpss://', r'https://'),
 771         # https://bx1.be/lives/direct-tv/
 772         (r'^rmtp([es]?)://', r'rtmp\1://'),
 773     )
 774     for mistake, fixup in COMMON_TYPOS:
 775         if re.match(mistake, url):
 776             return re.sub(mistake, fixup, url)
 777     return url
 778
 779
 780 def extract_basic_auth(url):
 781     parts = urllib.parse.urlsplit(url)
 782     if parts.username is None:
 783         return url, None
 784     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 785         parts.hostname if parts.port is None
 786         else '%s:%d' % (parts.hostname, parts.port))))
 787     auth_payload = base64.b64encode(
 788         ('%s:%s' % (parts.username, parts.password or '')).encode())
 789     return url, f'Basic {auth_payload.decode()}'
 790
 791
 792 def sanitized_Request(url, *args, **kwargs):
 793     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
 794     if auth_header is not None:
 795         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
 796         headers['Authorization'] = auth_header
 797     return urllib.request.Request(url, *args, **kwargs)
 798
 799
 800 def expand_path(s):
 801     """Expand shell variables and ~"""
 802     return os.path.expandvars(compat_expanduser(s))
 803
 804
 805 def orderedSet(iterable, *, lazy=False):
 806     """Remove all duplicates from the input iterable"""
 807     def _iter():
 808         seen = []  # Do not use set since the items can be unhashable
 809         for x in iterable:
 810             if x not in seen:
 811                 seen.append(x)
 812                 yield x
 813
 814     return _iter() if lazy else list(_iter())
 815
 816
 817 def _htmlentity_transform(entity_with_semicolon):
 818     """Transforms an HTML entity to a character."""
 819     entity = entity_with_semicolon[:-1]
 820
 821     # Known non-numeric HTML entity
 822     if entity in html.entities.name2codepoint:
 823         return chr(html.entities.name2codepoint[entity])
 824
 825     # TODO: HTML5 allows entities without a semicolon.
 826     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 827     if entity_with_semicolon in html.entities.html5:
 828         return html.entities.html5[entity_with_semicolon]
 829
 830     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 831     if mobj is not None:
 832         numstr = mobj.group(1)
 833         if numstr.startswith('x'):
 834             base = 16
 835             numstr = '0%s' % numstr
 836         else:
 837             base = 10
 838         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 839         with contextlib.suppress(ValueError):
 840             return chr(int(numstr, base))
 841
 842     # Unknown entity in name, return its literal representation
 843     return '&%s;' % entity
 844
 845
 846 def unescapeHTML(s):
 847     if s is None:
 848         return None
 849     assert isinstance(s, str)
 850
 851     return re.sub(
 852         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 853
 854
 855 def escapeHTML(text):
 856     return (
 857         text
 858         .replace('&', '&amp;')
 859         .replace('<', '&lt;')
 860         .replace('>', '&gt;')
 861         .replace('"', '&quot;')
 862         .replace("'", '&#39;')
 863     )
 864
 865
 866 def process_communicate_or_kill(p, *args, **kwargs):
 867     deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
 868                         f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
 869     return Popen.communicate_or_kill(p, *args, **kwargs)
 870
 871
 872 class Popen(subprocess.Popen):
 873     if sys.platform == 'win32':
 874         _startupinfo = subprocess.STARTUPINFO()
 875         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 876     else:
 877         _startupinfo = None
 878
 879     @staticmethod
 880     def _fix_pyinstaller_ld_path(env):
 881         """Restore LD_LIBRARY_PATH when using PyInstaller
 882             Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
 883                  https://github.com/yt-dlp/yt-dlp/issues/4573
 884         """
 885         if not hasattr(sys, '_MEIPASS'):
 886             return
 887
 888         def _fix(key):
 889             orig = env.get(f'{key}_ORIG')
 890             if orig is None:
 891                 env.pop(key, None)
 892             else:
 893                 env[key] = orig
 894
 895         _fix('LD_LIBRARY_PATH')  # Linux
 896         _fix('DYLD_LIBRARY_PATH')  # macOS
 897
 898     def __init__(self, *args, env=None, text=False, **kwargs):
 899         if env is None:
 900             env = os.environ.copy()
 901         self._fix_pyinstaller_ld_path(env)
 902
 903         self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
 904         if text is True:
 905             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 906             kwargs.setdefault('encoding', 'utf-8')
 907             kwargs.setdefault('errors', 'replace')
 908         super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
 909
 910     def communicate_or_kill(self, *args, **kwargs):
 911         try:
 912             return self.communicate(*args, **kwargs)
 913         except BaseException:  # Including KeyboardInterrupt
 914             self.kill(timeout=None)
 915             raise
 916
 917     def kill(self, *, timeout=0):
 918         super().kill()
 919         if timeout != 0:
 920             self.wait(timeout=timeout)
 921
 922     @classmethod
 923     def run(cls, *args, timeout=None, **kwargs):
 924         with cls(*args, **kwargs) as proc:
 925             default = '' if proc.__text_mode else b''
 926             stdout, stderr = proc.communicate_or_kill(timeout=timeout)
 927             return stdout or default, stderr or default, proc.returncode
 928
 929
 930 def encodeArgument(s):
 931     # Legacy code that uses byte strings
 932     # Uncomment the following line after fixing all post processors
 933     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 934     return s if isinstance(s, str) else s.decode('ascii')
 935
 936
 937 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 938
 939
 940 def timetuple_from_msec(msec):
 941     secs, msec = divmod(msec, 1000)
 942     mins, secs = divmod(secs, 60)
 943     hrs, mins = divmod(mins, 60)
 944     return _timetuple(hrs, mins, secs, msec)
 945
 946
 947 def formatSeconds(secs, delim=':', msec=False):
 948     time = timetuple_from_msec(secs * 1000)
 949     if time.hours:
 950         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 951     elif time.minutes:
 952         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 953     else:
 954         ret = '%d' % time.seconds
 955     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 956
 957
 958 def _ssl_load_windows_store_certs(ssl_context, storename):
 959     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 960     try:
 961         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 962                  if encoding == 'x509_asn' and (
 963                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 964     except PermissionError:
 965         return
 966     for cert in certs:
 967         with contextlib.suppress(ssl.SSLError):
 968             ssl_context.load_verify_locations(cadata=cert)
 969
 970
def make_HTTPS_handler(params, **kwargs):
    """Build the YoutubeDLHTTPSHandler with an SSL context configured from `params`.

    Keys read from `params`:
        nocheckcertificate           disable hostname and certificate verification
        legacyserverconnect          set SSL_OP_LEGACY_SERVER_CONNECT and the 'DEFAULT' ciphers
        compat_opts                  'no-certifi' skips the certifi CA bundle
        client_certificate           path of a client certificate to load
        client_certificate_key       key file for the client certificate
        client_certificate_password  password for the client certificate

    Additional keyword arguments are passed on to YoutubeDLHTTPSHandler.
    Raises YoutubeDLError if the client certificate cannot be loaded.
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # NOTE(review): check_hostname is assigned before verify_mode (set further down);
    # the ssl module refuses CERT_NONE while check_hostname is enabled, so keep this order
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # CA certificates: the certifi bundle if available and not opted out of,
        # otherwise the defaults of the ssl module
        if certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1032
1033
def bug_reports_message(before=';'):
    """Return the "please report this issue" notice, prefixed with `before`.

    The notice is capitalized when it starts a sentence, i.e. when `before`
    (ignoring trailing whitespace) is empty or ends with '.', '!' or '?'.
    """
    from ..update import REPOSITORY

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    prefix = before.rstrip()
    starts_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_sentence:
        msg = msg[0].title() + msg[1:]

    if not prefix:
        return msg
    return prefix + ' ' + msg
1045
1046
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Default message; subclasses may override it at class level
    msg = None

    def __init__(self, msg=None):
        """Use `msg` if given, else the class-level `msg`, else the class name."""
        if msg is None and self.msg is None:
            msg = type(self).__name__
        if msg is not None:
            self.msg = msg
        super().__init__(self.msg)
1057
1058
# Exception types that indicate a network problem;
# ssl.CertificateError is only included when the ssl module provides it
network_exceptions = (
    urllib.error.URLError,
    http.client.HTTPException,
    socket.error,
    *((ssl.CertificateError,) if hasattr(ssl, 'CertificateError') else ()),
)
1063
1064
class ExtractorError(YoutubeDLError):
    """Error during info extraction.

    The final message is assembled from `ie`, `video_id`, the original message and
    `cause`, and is rebuilt whenever one of the attributes is assigned later on.
    """

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        cause, video_id and ie, if given, are included in the final message.
        """
        # An error raised while a network exception is being handled is always "expected"
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When wrapping another ExtractorError, keep the exc_info that one recorded
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # '[ie] video_id: orig_msg (caused by ...)' followed by the
        # bug report notice unless the error is expected
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted `traceback` and `cause` joined by a newline, or None if there is neither."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once a message exists, assigning any other attribute rebuilds `msg` and `args`.
        # 'msg' and 'args' themselves are excluded, otherwise this would recurse
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
1106
1107
class UnsupportedError(ExtractorError):
    """Expected error for a URL that is not supported; the URL is kept in `url`."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1113
1114
class RegexNotFoundError(ExtractorError):
    """Extraction error for a regular expression that did not match"""
1118
1119
class GeoRestrictedError(ExtractorError):
    """Geographic restriction error.

    May be raised when a website does not make a video available in the
    user's geographic location. The error is always marked as expected;
    `countries` is stored on the instance as given.
    """

    def __init__(self, msg, countries=None, **kwargs):
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
1131
1132
class UserNotLive(ExtractorError):
    """Expected error for a channel/user that is not live at the moment"""

    def __init__(self, msg=None, **kwargs):
        message = msg or 'The channel is not currently live'
        super().__init__(message, **{**kwargs, 'expected': True})
1139
1140
class DownloadError(YoutubeDLError):
    """Download error.

    May be raised by FileDownloader objects that are not configured to
    continue on errors; it carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
1153
1154
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist.

    Thrown by YoutubeDL when a requested entry cannot be found
    in the info_dict of the playlist.
    """
    msg = 'Entry not found in info'
1162
1163
class SameFileError(YoutubeDLError):
    """Same file error.

    Thrown by FileDownloader objects when they detect that multiple
    files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        # The file name, if known, is appended to the class-level message
        if filename is not None:
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
1176
1177
class PostProcessingError(YoutubeDLError):
    """Post processing error.

    May be raised by the .run() method of a PostProcessor to signal
    that the postprocessing task failed.
    """
1184
1185
class DownloadCancelled(YoutubeDLError):
    """ Raised when the download queue should be interrupted """
    msg = 'The download was cancelled'
1189
1190
class ExistingVideoReached(DownloadCancelled):
    """ Raised when --break-on-existing is triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1194
1195
class RejectedVideoReached(DownloadCancelled):
    """ Raised when --break-match-filter is triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1199
1200
class MaxDownloadsReached(DownloadCancelled):
    """ Raised when the --max-downloads limit has been reached """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1204
1205
class ReExtractInfo(YoutubeDLError):
    """ The video info has to be extracted again; `expected` is stored on the instance. """

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1212
1213
class ThrottledDownload(ReExtractInfo):
    """ The download speed dropped below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(msg=self.msg, expected=False)
1220
1221
class UnavailableVideoError(YoutubeDLError):
    """Unavailable format error.

    Thrown when a video is requested in a format
    that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        # Details, if any, are appended to the class-level message
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1234
1235
class ContentTooShortError(YoutubeDLError):
    """Content too short error.

    May be raised by FileDownloader objects when a downloaded file is
    smaller than what the server announced first, which indicates that
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1249
1250
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an error `code` and `msg`, classified into `reason`.

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify by errno first, then by well-known fragments of the message
        if (code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in msg or 'Disk quota exceeded' in msg):
            reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1265
1266
class XAttrUnavailableError(YoutubeDLError):
    """Error type without behaviour of its own. NOTE(review): presumably raised when xattr support is missing - confirm against callers"""
1269
1270
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate `http_class` and make it honour the 'source_address' param of `ydl_handler`.

    `http_class` is called with the remaining positional and keyword arguments.
    If `ydl_handler._params` has a 'source_address', the connection is bound to it
    and only remote addresses of the same IP version are tried.
    `is_https` is not used in this function.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            # NB: the `source_address` parameter shadows the outer variable; it is indexed below,
            # so it is expected to be the (host, port) tuple assigned to hc.source_address
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A '.' in the source address selects IPv4, anything else IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try the candidates in order; return the first socket that connects
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    # Remember the failure and close the socket before trying the next address
                    err = _
                    if sock is not None:
                        sock.close()
            # Every candidate failed: re-raise the last error
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        # Only connection classes exposing a _create_connection hook get the filtering function
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1316
1317
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class adds the standard
    headers to every HTTP request and decodes gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        self._params = params
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)

    def http_open(self, req):
        """Open `req`, going through a SOCKS proxy if the 'Ytdl-socks-proxy' header is set."""
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if not proxy_url:
            connection_class = http.client.HTTPConnection
        else:
            connection_class = make_socks_conn_class(http.client.HTTPConnection, proxy_url)
            # Internal header, must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, given either as a raw stream or with zlib header."""
        if data:
            try:
                return zlib.decompress(data, -zlib.MAX_WBITS)
            except zlib.error:
                return zlib.decompress(data)
        return data

    @staticmethod
    def brotli(data):
        """Decompress brotli data; empty input is returned as is."""
        return brotli.decompress(data) if data else data

    @staticmethod
    def gz(data):
        """Decompress gzip data, tolerating up to 1023 bytes of trailing junk."""
        stream = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return stream.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for trim in range(1, 1024):
                try:
                    stream = gzip.GzipFile(fileobj=io.BytesIO(data[:-trim]), mode='rb')
                    return stream.read()
                except OSError:
                    pass
            # No amount of trimming helped, report the initial failure
            raise original_oserror

    def http_request(self, req):
        """Percent-encode the URL and add the default and Accept-encoding headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        original_url = req.get_full_url()
        escaped_url = escape_url(original_url)

        # Substitute URL if any change after escaping
        if escaped_url != original_url:
            req = update_Request(req, url=escaped_url)

        default_headers = self._params.get('http_headers', std_headers)
        for name, value in default_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if name.capitalize() in req.headers:
                continue
            req.add_header(name, value)

        if 'Youtubedl-no-compression' in req.headers:  # deprecated
            del req.headers['Youtubedl-no-compression']
            req.add_header('Accept-encoding', 'identity')

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Decode the response body and percent-encode the Location of redirects."""
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoders = {'gzip': self.gz, 'deflate': self.deflate}
        if brotli:
            decoders['br'] = self.brotli

        decoded_response = None
        applied_encodings = resp.headers.get('Content-encoding', '').split(',')
        for encoding in reversed(applied_encodings):
            decode = decoders.get(encoding.strip())
            if decode is not None:
                decoded_response = decode(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(
                io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if not 300 <= resp.code < 400:
            return resp
        location = resp.headers.get('Location')
        if not location:
            return resp
        # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
        location = location.encode('iso-8859-1').decode()
        location_escaped = escape_url(location)
        if location_escaped != location:
            del resp.headers['Location']
            resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
1445
1446
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of `base_class` that connects through the SOCKS proxy `socks_proxy`.

    `base_class` must be (a subclass of) http.client.HTTPConnection or HTTPSConnection.
    `socks_proxy` is the proxy URL; the schemes socks5, socks4a, socks4 and socks
    (= socks4) are supported, the port defaults to 1080 and username/password are
    taken (percent-decoded) from the URL.

    Raises ValueError if the scheme of `socks_proxy` is none of the above.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Without this branch `socks_type` would be unbound below
        raise ValueError(f'Unknown SOCKS proxy version: {scheme}')

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS the TLS layer is added on top of the proxied socket
            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1488
1489
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler supporting a custom connection class, SOCKS proxies and source address."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._params = params
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection

    def https_open(self, req):
        """Open `req`, going through a SOCKS proxy if the 'Ytdl-socks-proxy' header is set.

        A TLS handshake failure alert is re-raised as YoutubeDLError
        suggesting --legacy-server-connect.
        """
        # Pass on whichever of these attributes the base handler has set
        open_kwargs = {}
        for attr, option in (
            ('_context', 'context'),  # python > 2.6
            ('_check_hostname', 'check_hostname'),  # python 3.x
        ):
            if hasattr(self, attr):
                open_kwargs[option] = getattr(self, attr)

        connection_class = self._https_conn_class
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            # Internal header, must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, connection_class, True)
        try:
            return self.do_open(connection_factory, req, **open_kwargs)
        except urllib.error.URLError as e:
            reason = e.reason
            if not isinstance(reason, ssl.SSLError):
                raise
            if getattr(reason, 'reason', None) != 'SSLV3_ALERT_HANDSHAKE_FAILURE':
                raise
            raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1518
1519
def is_path_like(f):
    """Whether `f` is a str, bytes or os.PathLike object."""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1522
1523
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """HTTPCookieProcessor whose handlers are also registered for https."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1533
1534
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7231
     and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the request that follows the redirect of `req` to `newurl`.

        The method is changed to GET for a 303 (unless the request was a HEAD)
        and for a POST answered with 301/302. Whenever the method changes, the
        payload and its Content-Length/Content-Type headers are dropped.

        Raises urllib.error.HTTPError if `code` is not 301, 302, 303, 307 or 308.
        """
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_method = req.get_method()
        new_data = req.data
        # Lowercase names of the headers that must not be carried over
        remove_headers = set()
        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and req.get_method() != 'HEAD':
            new_method = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        elif code in (301, 302) and req.get_method() == 'POST':
            new_method = 'GET'

        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.update(('content-length', 'content-type'))

        # urllib stores header names capitalized (e.g. 'Content-length'),
        # so the names have to be compared case-insensitively
        new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
1578
1579
def extract_timezone(date_str):
    """Split the timezone off the end of `date_str`.

    Returns a tuple (UTC offset as datetime.timedelta, date_str without the timezone).
    Recognized are 'Z', numeric offsets (+hh:mm, -hhmm, ...) and the names in
    TIMEZONE_NAMES following a time; without any of these the offset is zero
    and the string is returned unchanged.
    """
    match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)

    if match:
        remainder = date_str[:-len(match.group('tz'))]
        sign_char = match.group('sign')
        if not sign_char:
            # Plain 'Z', i.e. UTC
            return datetime.timedelta(), remainder
        direction = 1 if sign_char == '+' else -1
        offset = datetime.timedelta(
            hours=direction * int(match.group('hours')),
            minutes=direction * int(match.group('minutes')))
        return offset, remainder

    # No numeric offset: look for a timezone name right after a time
    match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    named_offset = TIMEZONE_NAMES.get(match and match.group('tz').strip())
    if named_offset is not None:
        date_str = date_str[:-len(match.group('tz'))]
    return datetime.timedelta(hours=named_offset or 0), date_str
1608
1609
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """
    Return a UNIX timestamp from the given date

    @param delimiter    separator between the date and the time part
    @param timezone     datetime.timedelta offset to subtract; when None it is
                        extracted from date_str with extract_timezone
    @returns            integer timestamp, or None if date_str is None or does not
                        match "%Y-%m-%d<delimiter>%H:%M:%S"
    """
    if date_str is None:
        return None

    # Fractional seconds are discarded before parsing
    without_fraction = re.sub(r'\.[0-9]+', '', date_str)
    if timezone is None:
        timezone, without_fraction = extract_timezone(without_fraction)

    try:
        parsed = datetime.datetime.strptime(without_fraction, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((parsed - timezone).timetuple())
    except ValueError:
        return None
1625
1626
def date_formats(day_first=True):
    """Return the strptime format collection matching the expected day/month order"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1629
1630
def unified_strdate(date_str, day_first=True):
    """
    Return a string with the date in the format YYYYMMDD

    @param day_first    passed to date_formats to choose the candidate formats
    @returns            the date string, or None if date_str is None or unparsable
    """
    if date_str is None:
        return None

    # Commas become spaces; AM/PM markers (plus an optional following word) and any
    # timezone recognised by extract_timezone are dropped before parsing
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    cleaned = extract_timezone(cleaned)[1]

    result = None
    for fmt in date_formats(day_first):
        # Every format is tried, so the last one that parses decides the result
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to the email-style date parser
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            with contextlib.suppress(ValueError):
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')

    return None if result is None else str(result)
1653
1654
def unified_timestamp(date_str, day_first=True):
    """
    Return a UNIX timestamp parsed from a free-form date string

    @param day_first    passed to date_formats to choose the candidate formats
    @returns            the timestamp, or None if date_str is None or unparsable
    """
    if date_str is None:
        return None

    # Drop commas, pipes and weekday names, then collapse whitespace runs
    without_weekday = re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)
    date_str = re.sub(r'\s+', ' ', without_weekday)

    # A "PM" anywhere in the string shifts the result by 12 hours
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    unknown_tz = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if unknown_tz:
        date_str = date_str[:-len(unknown_tz.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    nanos = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if nanos:
        date_str = nanos.group(1)

    for fmt in date_formats(day_first):
        # The first format that parses wins
        try:
            dt = datetime.datetime.strptime(date_str, fmt) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            continue

    # Fall back to the email-style date parser
    timetuple = email.utils.parsedate_tz(date_str)
    if not timetuple:
        return None
    return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1686
1687
def determine_ext(url, default_ext='unknown_video'):
    """
    Guess the file extension from a URL

    The text after the last "." of the part before the first "?" is returned when it
    is purely alphanumeric, or when it is a known extension followed by slashes.
    Otherwise default_ext is returned.
    """
    if url is None or '.' not in url:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1699
1700
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return filename with its extension replaced by "<sub_lang>.<sub_format>" (see replace_extension)"""
    sub_ext = '.'.join((sub_lang, sub_format))
    return replace_extension(filename, sub_ext, expected_real_ext)
1703
1704
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        # Without an offset unit there is nothing to round to
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # Plain DATE without an offset
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the offset is resolved recursively
    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount

    unit = match.group('unit')
    if unit in ('month', 'year'):
        # Calendar-aware arithmetic; the result is rounded to whole days in auto mode
        new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        new_date = start_time + datetime.timedelta(**{f'{unit}s': amount})

    return datetime_round(new_date, unit) if auto_precision else new_date
1745
1746
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    @raises ValueError  if strict is set and date_str does not follow those patterns
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1757
1758
def datetime_add_months(dt, months):
    """
    Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    # Work with a zero-based month index so that divmod yields the year carry
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    year, month = dt.year + year_shift, month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1766
1767
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    @param precision    microsecond|second|minute|hour|day
                        "microsecond" returns dt unchanged; any other value
                        rebuilds the datetime from a rounded UTC timestamp
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    timestamp = calendar.timegm(dt.timetuple())
    step = unit_seconds[precision]
    # Adding half a step before flooring rounds to the nearest multiple of step
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1784
1785
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged.
    """
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1794
1795
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """
        start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest/latest representable date.
        @raises ValueError  if start is later than end
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start, strict=True))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end, strict=True))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed (non-strictly) first
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
1829
1830
@functools.cache
def system_identifier():
    """
    Return a one-line description of the running interpreter and platform

    Covers the Python version and implementation, machine, architecture,
    platform string, OpenSSL version and, when available, the libc version.
    """
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_ver = []

    libc_info = format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s')
    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        libc_info,
    )
1849
1850
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    # Element 1 of win32_ver() is the version string
    return version_tuple(platform.win32_ver()[1])
1858
1859
def write_string(s, out=None, encoding=None):
    """
    Write the string s to a stream and flush it

    @param out          target stream; sys.stderr when omitted. Nothing is written
                        if no stream is available
    @param encoding     encoding to use when bytes have to be written; defaults to
                        the stream's own encoding or the preferred encoding
    """
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # Put a space in front of every run of line breaks
        s = re.sub(r'([\r\n]+)', r' \1', s)

    target, enc = out, None
    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: encode ourselves and write to it directly
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: bypass the text layer
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    payload = s.encode(enc, 'ignore') if enc else s
    target.write(payload)
    out.flush()
1879
1880
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """
    Report a deprecation

    When running as CLI, each distinct message is reported only once, through
    printer if given or else through write_string. Otherwise a DeprecationWarning
    is issued with the warnings module.

    @param printer      callable receiving the message (CLI mode only)
    @param stacklevel   extra stack levels to skip for warnings.warn
    @param kwargs       forwarded to printer/write_string
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    seen = deprecation_warning._cache
    if msg in seen:
        return
    seen.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Messages that were already reported in CLI mode
deprecation_warning._cache = set()
1896
1897
def bytes_to_intlist(bs):
    """Return the items of bs as a list of integers (characters are converted with ord)"""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    return list(map(ord, bs))
1905
1906
def intlist_to_bytes(xs):
    """Pack a sequence of integers into bytes, one unsigned byte per item"""
    if xs:
        return struct.pack(f'{len(xs)}B', *xs)
    return b''
1911
1912
class LockingUnsupportedError(OSError):
    """Error raised when file locking is not supported; always carries the fixed message below"""

    msg = 'File locking is not supported'

    def __init__(self):
        # Takes no arguments: the message is always the class-level one
        super().__init__(self.msg)
1918
1919
# Cross-platform file locking
# Defines `_lock_file(f, exclusive, block)` and `_unlock_file(f)` with whichever backend
# is available: LockFileEx/UnlockFileEx on Windows, fcntl elsewhere, and stubs that
# raise LockingUnsupportedError when fcntl cannot be imported
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high DWORDs of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the file object f; raises BlockingIOError if LockFileEx fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 requests an exclusive lock, 0x1 makes the call fail instead of waiting
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError if UnlockFileEx fails"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock the file object f with flock(), falling back to lockf()"""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # The lock is held by someone else; do not retry with lockf()
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock on f, trying each unlock variant until one succeeds"""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
2006
2007
class locked_file:
    """
    Context manager wrapping a file that is locked while in use

    The file is opened in the constructor and locked on entering the context
    (shared for read modes, exclusive otherwise). Files opened for writing are
    truncated only once the lock is held. Unknown attributes are delegated to
    the underlying file object.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """
        @param mode     one of r, rb, a, ab, w, wb
        @param block    whether to wait for the lock
        @raises NotImplementedError  for any other mode
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        if not writable:
            flags = os.O_RDONLY
        elif readable:
            flags = os.O_RDWR
        else:
            flags = os.O_WRONLY

        flags |= getattr(os, 'O_CLOEXEC', 0)  # UNIX only
        flags |= getattr(os, 'O_BINARY', 0)  # Windows only
        flags |= getattr(os, 'O_NOINHERIT', 0)  # Windows only
        if writable:
            flags |= os.O_CREAT  # O_TRUNC only after locking
        if 'a' in mode:
            flags |= os.O_APPEND
        if 'x' in mode:
            flags |= os.O_EXCL

        fd = os.open(filename, flags, 0o666)
        self.f = os.fdopen(fd, mode, encoding=encoding)

    def __enter__(self):
        try:
            _lock_file(self.f, 'r' not in self.mode, self.block)
            self.locked = True
        except OSError:
            # Do not leave the file open if it could not be locked
            self.f.close()
            raise

        if 'w' not in self.mode:
            return self
        try:
            self.f.truncate()
        except OSError as e:
            ignorable = (
                errno.ESPIPE,  # Illegal seek - expected for FIFO
                errno.EINVAL,  # Invalid argument - expected for /dev/null
            )
            if e.errno not in ignorable:
                raise
        return self

    def unlock(self):
        """Release the lock if it is held; the file stays open"""
        if self.locked:
            try:
                _unlock_file(self.f)
            finally:
                self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2071
2072
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None"""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2077
2078
def shell_quote(args):
    """Return the arguments quoted for a shell and joined by spaces"""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
2088
2089
def smuggle_url(url, data):
    """
    Pass additional data in a URL for internal use.

    The data is JSON-encoded into the URL fragment. Data already smuggled in url
    is merged in and wins on conflicting keys. NB: data is updated in place.
    """
    url, already_smuggled = unsmuggle_url(url, {})
    data.update(already_smuggled)
    payload = json.dumps(data)
    return url + '#' + urllib.parse.urlencode({'__youtubedl_smuggle': payload})
2098
2099
def unsmuggle_url(smug_url, default=None):
    """
    Split a URL produced by smuggle_url into the URL and the smuggled data

    @returns    (url, data); (smug_url, default) if nothing was smuggled
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    smuggled = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(smuggled)
2107
2108
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """
    Formats numbers with decimal suffixes like K, M, etc

    @param num      number to format (anything float_or_none accepts)
    @param fmt      %-format string receiving (scaled number, suffix)
    @param factor   ratio between consecutive suffixes; 1024 selects Ki, Mi, ...
    @returns        the formatted string, or None if num is not a number or is negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp on both sides: values below 1/factor have a negative logarithm,
        # which would otherwise index the suffix list from its end
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2121
2122
def format_bytes(bytes):
    """Format a byte count with two decimals and a binary suffix, or 'N/A' if that is not possible"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2125
2126
def lookup_unit_table(unit_table, s, strict=False):
    """
    Parse "<number><unit>" and return the number scaled by the unit's multiplier

    @param unit_table   mapping of unit name to multiplier
    @param strict       require the whole string to match and only accept the number
                        format of NUMBER_RE; otherwise a prefix match suffices and
                        "," is accepted as decimal separator as well
    @returns            the rounded product, or None if s does not match
    """
    if strict:
        num_re, matcher = NUMBER_RE, re.fullmatch
    else:
        num_re, matcher = NUMBER_RE.replace(R'\.', '[,.]'), re.match
    units_re = '|'.join(map(re.escape, unit_table))
    m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None

    number = float(m.group('num').replace(',', '.'))
    return round(number * unit_table[m.group('unit')])
2138
2139
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    # '' -> 1, 'K' -> 1024, 'M' -> 1024**2, ...
    binary_units = {}
    for power, unit in enumerate(['', *'KMGTPEZY']):
        binary_units[unit] = 1024**power
    return lookup_unit_table(binary_units, s.upper(), strict=True)
2145
2146
def parse_filesize(s):
    """
    Parse a file size such as "5 MiB" into a number of bytes

    @returns    the size as returned by lookup_unit_table, or None if s is None
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1, 'bytes': 1}
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for exponent, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** exponent, 1024 ** exponent
        lower = letter.lower()
        # Same seven spellings (and the same insertion order) for every prefix
        _UNIT_TABLE.update({
            f'{letter}iB': binary,
            f'{letter}B': decimal,
            f'{lower}B': binary,
            f'{letter}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(_UNIT_TABLE, s)
2216
2217
def parse_count(s):
    """
    Parse a count, optionally followed by a k/m/kk/b multiplier, into an integer

    @returns    the count, or None if s is None or nothing could be parsed
    """
    if s is None:
        return None

    # Drop a leading non-numeric word
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    scaled = lookup_unit_table(_UNIT_TABLE, s)
    if scaled is not None:
        return scaled

    # Last resort: a leading number followed by whitespace or the end of the string
    leading = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(leading.group(1)) if leading else None
2245
2246
def parse_resolution(s, *, lenient=False):
    """
    Extract a resolution from a string

    Recognises "<width>x<height>" (also with "X", "×" or ","), "<height>p"/"<height>i"
    and "4k"/"8k", in that order.

    @param lenient  do not require the width/height pair to be delimited from
                    surrounding letters and digits
    @returns        dict with "width" and/or "height"; empty if nothing was found
    """
    if s is None:
        return {}

    dimensions_re = (
        r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
        else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    dimensions = re.search(dimensions_re, s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if lines:
        return {'height': int(lines.group(1))}

    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(kilo.group(1)) * 540}

    return {}
2270
2271
def parse_bitrate(s):
    """Return the integer in front of "kbps" in s, or None if s is not a string or has none"""
    if not isinstance(s, str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2278
2279
def month_by_name(name, lang='en'):
    """
    Return the number of a month by (locale-independently) English name

    @param lang     key into MONTH_NAMES; unknown languages fall back to 'en'
    @returns        1-based month number, or None if the name is not found
    """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        position = month_names.index(name)
    except ValueError:
        return None
    return position + 1
2289
2290
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, or None if the abbreviation is not found """
    # The abbreviation is the first three letters of the full name
    abbreviations = [full_name[:3] for full_name in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
2299
2300
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, except those that start an entity or character reference"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
2307
2308
def setproctitle(title):
    """
    Set the name of the current process (best effort)

    Uses prctl() from libc.so.6; silently does nothing when ctypes, the library
    or the function is unavailable.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initialising the buffer from the bytes makes it one item larger than the
    # title, so the name handed to prctl is always NUL-terminated
    buf = ctypes.create_string_buffer(title.encode())
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
2334
2335
def remove_start(s, start):
    """Return s without the prefix start; s is returned as-is if it is None or lacks the prefix"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2338
2339
def remove_end(s, end):
    """Return s without the suffix end; s is returned as-is if it is None or lacks the suffix"""
    if s is None or not s.endswith(end):
        return s
    # Slice by absolute position: `s[:-len(end)]` is `s[:0]` (empty) for an empty suffix
    return s[:len(s) - len(end)]
2342
2343
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s, if present"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2351
2352
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"

    Returns the network location of url without a leading "www.", or None if it is empty.
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
2359
2360
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and trailing slashes"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
2364
2365
def base_url(url):
    """
    Return the http(s) URL up to and including the last "/" before any "?" or "#"

    NB: raises AttributeError if url does not match
    """
    matched = re.match(r'https?://[^?#]+/', url)
    return matched.group()
2368
2369
def urljoin(base, path):
    """
    Join path onto base, tolerating bytes and invalid input

    @returns    path itself if it already has a scheme or starts with "//";
                the joined URL if base is an http(s) or protocol-relative URL;
                None if path is empty or not a string, or base is unusable
    """
    def decoded(value):
        return value.decode() if isinstance(value, bytes) else value

    path = decoded(path)
    if not (isinstance(path, str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    base = decoded(base)
    if isinstance(base, str) and re.match(r'^(?:https?:)?//', base):
        return urllib.parse.urljoin(base, path)
    return None
2383
2384
class HEADRequest(urllib.request.Request):
    """urllib request that always uses the HEAD method"""

    def get_method(self):
        return 'HEAD'
2388
2389
class PUTRequest(urllib.request.Request):
    """urllib request that always uses the PUT method"""

    def get_method(self):
        return 'PUT'
2393
2394
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """
    Convert v to int, returning default if that is not possible

    @param scale        divisor applied (floor division) after conversion
    @param invscale     multiplier applied after conversion
    @param get_attr     if set, convert this attribute of v instead of v itself
    """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    try:
        scaled = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
    return scaled
2402
2403
def str_or_none(v, default=None):
    """Return str(v), or default if v is None"""
    if v is None:
        return default
    return str(v)
2406
2407
def str_to_int(int_str):
    """
    A more relaxed version of int_or_none

    Integers are returned as-is; in strings ",", "." and "+" are ignored.
    Any other type yields None.
    """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
2415
2416
def float_or_none(v, scale=1, invscale=1, default=None):
    """
    Convert v to float, returning default if that is not possible

    @param scale        divisor applied after conversion
    @param invscale     multiplier applied after conversion
    """
    if v is not None:
        with contextlib.suppress(ValueError, TypeError):
            return float(v) * invscale / scale
    return default
2424
2425
def bool_or_none(v, default=None):
    """Return v if it is a bool, else default"""
    if isinstance(v, bool):
        return v
    return default
2428
2429
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace if it is a string, else default"""
    if isinstance(v, str):
        return v.strip()
    return default
2432
2433
def url_or_none(url):
    """
    Return the stripped url if it looks like a URL, else None

    Accepted are protocol-relative URLs ("//...") and the http(s), rtmp-family,
    rtsp-family, mms and ftp(s) schemes.
    """
    if not (url and isinstance(url, str)):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2439
2440
def request_to_url(req):
    """Return the full URL of a urllib Request; anything else is returned unchanged"""
    return req.get_full_url() if isinstance(req, urllib.request.Request) else req
2446
2447
def strftime_or_none(timestamp, date_format, default=None):
    """
    Format a timestamp with strftime, returning default on failure

    @param timestamp    UNIX timestamp (int/float, interpreted as UTC) or a
                        "YYYYMMDD" string
    @param date_format  strftime format; "%s" is expanded to the UNIX timestamp
    @param default      returned when timestamp has another type or cannot be
                        parsed, represented or formatted
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # fromtimestamp() is also limited to the range of the platform's C time
            # functions (e.g. it can fail for negative timestamps), so offset from the epoch instead
            datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                               + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # For any other type datetime_object stays None and the AttributeError below yields default
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError):
        # OverflowError: the timestamp is outside the range datetime can represent
        return default
2462
2463
def parse_duration(s):
    """
    Parse a duration string into a number of seconds

    Three notations are tried in order: clock style ("[[[DD:]HH:]MM:]SS[.ms]"),
    unit style ("3h 11m 53s", also in ISO 8601 form like "PT1H0.040S"), and a
    single decimal amount of hours or minutes ("2.5 hours").

    @returns    the duration in seconds, or None if s is not a string, is blank,
                or matches none of the notations
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None
    clock = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if clock:
        days, hours, mins, secs, ms = clock.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # Years, months and weeks are accepted by the pattern but not captured
        worded = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if worded:
            days, hours, mins, secs, ms = worded.groups()
        else:
            single = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not single:
                return None
            hours, mins = single.groups()

    if ms:
        # The fraction may have been written with a colon ("01:02:03:050")
        ms = ms.replace(':', '.')
    weighted = ((days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1))
    return sum(float(value or 0) * weight for value, weight in weighted)
2518
2519
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the existing extension of `filename`

    When `expected_real_ext` is given and does not equal the current
    extension (compared without the dot), `ext` is appended to the whole
    filename instead.
    """
    stem, suffix = os.path.splitext(filename)
    if expected_real_ext and suffix[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{stem}.{ext}{suffix}'
2526
2527
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of `filename` with `ext`

    When `expected_real_ext` is given and does not equal the current
    extension (compared without the dot), nothing is removed and `ext` is
    appended to the whole filename instead.
    """
    stem, suffix = os.path.splitext(filename)
    if expected_real_ext and suffix[1:] != expected_real_ext:
        stem = filename
    return f'{stem}.{ext}'
2533
2534
def check_executable(exe, args=[]):
    """Return `exe` if it can be launched, False if launching raises OSError

    args is a list of extra arguments appended to the command; pick ones that
    make the program exit quickly with short output (like -version)
    """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
2543
2544
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout/stderr text

    Returns False if launching raises OSError and None if the process
    exits with a non-zero return code.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        output, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else output
2557
2558
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable

    The first group of `version_re` (default: the token following the word
    "version") is returned; `unrecognized` is returned if nothing matches.
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2568
2569
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    `unrecognized` holds one or two fallbacks: the first is returned when the
    output contains no recognizable version, the last when the executable
    exits with a non-zero return code """
    fallbacks = variadic(unrecognized)
    assert len(fallbacks) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return fallbacks[-1]
    if not output:
        # False (could not be launched) or empty output is passed through as-is
        return output
    return detect_exe_version(output, version_re, fallbacks[0])
2580
2581
def frange(start=0, stop=None, step=1):
    """Float range

    Works like range() but accepts non-integer values. With a single
    argument it counts from 0 up to that value. A zero step yields nothing.
    """
    if stop is None:
        start, stop = 0, start
    # Multiplying both sides by the direction lets one comparison serve
    # ascending and descending ranges alike
    direction = (1 if step > 0 else -1) if step else 0
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2590
2591
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the underlying iterable only when needed and are
    kept in a cache, so the same items can be read repeatedly."""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = _cache if _cache is not None else []
        self._reversed = reverse

    def __iter__(self):
        if not self._reversed:
            yield from self._cache
            for item in self._iterable:
                self._cache.append(item)
                yield item
            return
        # We need to consume the entire iterable to iterate in reverse
        yield from self.exhaust()

    def _exhaust(self):
        """Pull every remaining item into the cache and return the cache"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        everything = self._exhaust()
        return everything[::-1] if self._reversed else everything[::1]

    @staticmethod
    def _reverse_index(x):
        """Map an index to the matching one counted from the other end"""
        if x is None:
            return None
        return ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        needs_everything = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if needs_everything:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            # Fetch only as many items as are needed to reach the highest index
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Probing a single item avoids evaluating the whole iterable
        probe = -1 if self._reversed else 0
        try:
            self[probe]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        return len(self._exhaust())

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2679
2680
class PagedList:
    """Base class for lists whose items are fetched one page at a time

    `pagefunc(pagenum)` must return an iterable with the items of that page.
    Subclasses implement `_getslice(start, end)`."""

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the items of page `pagenum` as a list, using the cache if available"""
        page = self._cache.get(pagenum)
        if page is None:
            if pagenum > self._pagecount:
                # Pages beyond the last known page are empty; skip the fetch
                page = []
            else:
                page = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page
        return page

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not (isinstance(idx, int) and idx >= 0):
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()
2719
2720
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            pagesize = self._pagesize
            firstid = pagenum * pagesize
            nextfirstid = firstid + pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested range inside the current page
            startv = start % pagesize if firstid <= start < nextfirstid else 0
            endv = None
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % pagesize) + 1

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last page that could be fetched, then propagate
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # Stop when either
            #  - the current page is not "full", i.e. does not contain
            #    page_size videos; we can then assume that this page is the
            #    last one and there is no need to query again, or
            #  - we got the whole page, but the next page is not interesting
            if len(page_results) + startv < pagesize or end == nextfirstid:
                break
2760
2761
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        super().__init__(pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        if end is None:
            last_page, remaining = self._pagecount, None
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start
        # Number of leading items to drop from the first fetched page
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, last_page):
            page = self.getpage(pagenum)
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested range
                    yield from page[:remaining]
                    break
                remaining -= len(page)
            yield from page
2786
2787
class PlaylistEntries:
    """Indexable view over the `entries` of a playlist info dict

    Indexing uses 1-based playlist positions and always yields
    `(playlist_index, entry)` pairs; see `__getitem__`.
    """

    # Sentinel stored at positions of an incomplete playlist that have no entry
    MissingEntry = object()
    # Becomes True once the end of the entries is known to have been reached
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        object providing `params`, `report_warning`, `_match_entry`
                          and (via its type) `_handle_extraction_exceptions`
        @param info_dict  playlist info dict; its "entries" and
                          "requested_entries" keys are read
        @raises EntryNotInPlaylist  if info_dict has no "entries"
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Rebuild a sparse list: each given entry is placed at the (1-based)
            # position named by requested_entries; all other slots are MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Parse a comma-separated item specification

        Yields an int for a single index and a slice for a range
        ("start:end", "start-end" or "start:end:step").

        @raises ValueError  on an empty segment, a segment that does not match
                            PLAYLIST_ITEMS_RE, or a zero step
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # `end` goes through float_or_none since it may be "inf"/"infinite"
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield `(playlist_index, entry)` for the items selected by ydl.params

        The selection comes from "playlist_items" if set, otherwise from
        "playliststart"/"playlistend". Iteration stops when matching an entry
        raises ExistingVideoReached or RejectedVideoReached.
        """
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries when it is known, else None

        It is known when the entries are exhausted and complete, or when they
        are an InAdvancePagedList with one entry per page.
        """
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function returning the entry at a 0-based index

        The returned function raises self.IndexError past the end of the
        entries. For list-backed entries it raises EntryNotInPlaylist when the
        slot holds MissingEntry or lies beyond an incomplete list.
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # The lookup is wrapped with the ydl class' _handle_extraction_exceptions
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Yield `(playlist_index, entry)` for an int or slice of 1-based positions

        Unlike regular slicing, the stop position is inclusive. Negative
        positions count from the end and require computing len(self).
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move stop one position further in the direction of travel so that
        # the stop position itself is included
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts by iterating over every entry
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2922
2923
def uppercase_escape(s):
    """Decode literal backslash-U escapes (8 hex digits) found in `s`

    All other text, including other escape sequences, is left untouched.
    """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        text, _consumed = decode(match.group(0))
        return text

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
2930
2931
def lowercase_escape(s):
    """Decode literal backslash-u escapes (4 hex digits) found in `s`

    All other text, including other escape sequences, is left untouched.
    """
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        text, _consumed = decode(match.group(0))
        return text

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
2938
2939
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that are passed through unquoted; "%" is among them, so
    # existing percent-escapes are not escaped a second time
    untouched = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe=untouched)
2943
2944
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)
    # The host is IDNA-encoded; the remaining text components are percent-escaped
    ascii_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = {
        component: escape_rfc3986(getattr(parts, component))
        for component in ('path', 'params', 'query', 'fragment')
    }
    return parts._replace(netloc=ascii_netloc, **escaped).geturl()
2955
2956
def parse_qs(url, **kwargs):
    """Parse the query string of `url` into a dict of lists

    Extra keyword arguments are forwarded to urllib.parse.parse_qs.
    """
    query_string = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query_string, **kwargs)
2959
2960
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file, one per line

    Lines may be str or bytes (bytes are decoded as UTF-8 with replacement).
    A leading BOM and leading whitespace are removed; empty lines and lines
    starting with "#", ";" or "]" are skipped; a trailing comment introduced
    by whitespace followed by "#" is cut off.

    @param batch_fd  iterable file-like object; it is closed before returning
    @returns         list of the URLs in file order
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # The BOM either as three mis-decoded characters or as the real code point
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit is passed by keyword; passing it positionally is deprecated
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2978
2979
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as ASCII bytes

    All arguments are forwarded to urllib.parse.urlencode.
    """
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2982
2983
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  mapping merged over the existing query parameters;
                            cannot be combined with a `query` kwarg
       @returns             str
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            # Nothing to change; skip the parse/unparse round trip
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged_query = {
            **urllib.parse.parse_qs(url.query),
            **query_update
        }
        # doseq=True: the parsed parameters are lists of values
        kwargs['query'] = urllib.parse.urlencode(merged_query, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
3002
3003
def update_url_query(url, query):
    """Return `url` with the parameters of `query` merged into its query string

    Thin wrapper around update_url(); a falsy `query` changes nothing.
    """
    return update_url(url, query_update=query)
3006
3007
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Build a new request from `req` with some parts overridden

    @param req      the request to copy
    @param url      replacement URL (default: the URL of `req`)
    @param data     replacement body (default: the body of `req`)
    @param headers  headers merged over those of `req`
    @param query    query parameters merged into the URL
    @returns        a new HEADRequest, PUTRequest or urllib.request.Request,
                    chosen from the method of `req`
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    method = req.get_method()
    if method == 'HEAD':
        request_cls = HEADRequest
    elif method == 'PUT':
        request_cls = PUTRequest
    else:
        request_cls = urllib.request.Request

    updated = request_cls(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # timeout is carried over only when the original request has one
    if hasattr(req, 'timeout'):
        updated.timeout = req.timeout
    return updated
3026
3027
3028 def _multipart_encode_impl(data, boundary):
3029     content_type = 'multipart/form-data; boundary=%s' % boundary
3030
3031     out = b''
3032     for k, v in data.items():
3033         out += b'--' + boundary.encode('ascii') + b'\r\n'
3034         if isinstance(k, str):
3035             k = k.encode()
3036         if isinstance(v, str):
3037             v = v.encode()
3038         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3039         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3040         content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3041         if boundary.encode('ascii') in content:
3042             raise ValueError('Boundary overlaps with data')
3043         out += content
3044
3045     out += b'--' + boundary.encode('ascii') + b'--\r\n'
3046
3047     return out, content_type
3048
3049
def multipart_encode(data, boundary=None):
    """
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple of the encoded body and the Content-Type header value.
    With an explicit boundary, ValueError is raised if it occurs in the data.

    Reference: https://tools.ietf.org/html/rfc7578
    """
    if boundary is not None:
        # A caller-supplied boundary is used as-is; a clash is the caller's error
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            body, content_type = _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary occurs in the data; draw another one
            continue
        return body, content_type
3078
3079
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Check whether `x` is an instance of `allowed_types` but not of `blocked_types`

    By default str, bytes and mappings are blocked, i.e. not treated as iterables.
    """
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
3084
3085
def variadic(x, allowed_types=NO_DEFAULT):
    """Return `x` unchanged if it is iterable-like, else wrap it in a 1-tuple

    `allowed_types` is forwarded to is_iterable_like() as its blocked types,
    i.e. instances of these types are wrapped rather than passed through.
    """
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
3091
3092
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn and return the first acceptable result

    A function is skipped if it raises one of the common lookup/arithmetic
    errors, or if `expected_type` is given and its result is not an instance
    of it. Returns None if no function produces an acceptable result.
    """
    ignored_errors = (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError)
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except ignored_errors:
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
3102
3103
def try_get(src, getter, expected_type=None):
    """Apply one or more getter functions to `src` and return the first acceptable result

    `getter` may be a single callable or an iterable of callables; see try_call()
    for which failures are ignored.
    """
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
3106
3107
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of `dct` for which `cndn(key, value)` is truthy

    By default, items whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
3110
3111
def merge_dicts(*dicts):
    """Merge dicts, giving precedence to earlier ones

    A key keeps the first non-None value seen for it, except that an empty
    string is replaced by a string value from a later dict.
    """
    merged = {}
    for source in dicts:
        for key, value in source.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
3120
3121
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as str, decoding it with `encoding` if it is not a str already"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
3124
3125
# Age limit returned by parse_age_limit() for each US rating label.
# Lookup is done on the upper-cased input, so keys must be upper case
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
3133
3134
# Age limit returned by parse_age_limit() for each TV rating label.
# Every key must start with "TV-": parse_age_limit() builds its regex from
# the key text after the first three characters and re-adds "TV-" for lookup
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3143
3144
def parse_age_limit(s):
    """Convert an age rating into a numeric age limit

    Accepts an int between 0 and 21, a string of one or two digits with an
    optional trailing "+", a key of US_RATINGS, or a TV rating such as
    "TV-14", "TV_14" or "TV14" (ratings are case-insensitive).
    Returns None for anything else.
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    # Alternation of the TV rating keys with their "TV-" prefix removed
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)] if tv_rating else None
3161
3162
def strip_jsonp(code):
    """Remove a JSONP wrapper and return the data passed to the callback

    Handles an optional "window." prefix, the "name && name(...)" guard form,
    an optional trailing semicolon and trailing "//" comments. Input that is
    not wrapped this way is returned unchanged.
    """
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
3171
3172
def js_to_json(code, vars={}, *, strict=False):
    """Convert JavaScript literal code into a JSON string

    @param code    JavaScript source of the data (object/array literal etc)
    @param vars    dict of var, val pairs to substitute for bare identifiers
    @param strict  if true, raise ValueError for identifiers that cannot be
                   resolved instead of quoting them as strings, and skip the
                   lenient rewrites of "new ...", parseInt and function calls
    @returns       the converted text
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # A quoted string for each quote character, allowing backslash escapes
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    # A block comment, or a line comment up to and including the newline
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (regex, base) for hexadecimal and octal integers, optionally followed by ":"
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Rewrites one match of the escape regex used in fix_kv: group 1 is a
        # bare double quote, group 2 the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Converts the expression inside "${...}" of a template string; a
        # result that is a JSON string is unwrapped to its text
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Replacement callback for each token matched by the final regex
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, "!" runs and trailing commas are removed
            return ''

        if v[0] in STRING_QUOTES:
            # Template strings get their "${...}" parts substituted first;
            # then the content is re-quoted with double quotes
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # An integer used as an object key is emitted as a quoted string
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                # The value is not valid JSON by itself, so encode it as a JSON string
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # "new Map([[k, v], ...])" becomes a JSON object built from the pairs
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # Lenient rewrites: unwrap new Date("..."), turn other "new X(...)"
        # expressions into strings, and reduce parseInt(...) and immediately
        # invoked functions called with a string to their argument
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
3251
3252
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its position in `quality_ids`,
    or to -1 if it is not listed """
    def rank(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return rank
3261
3262
# Names of the recognized postprocessing stages
# NOTE(review): presumably the values accepted for a postprocessor's "when" setting
# and listed in execution order - confirm against the users of this constant
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3264
3265
# Default output templates (%-style, with named fields), keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types, each mapped to a string or None
# NOTE(review): the string looks like the filename suffix used for that type of
# file, with None meaning no fixed suffix - confirm against the users of this constant
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
3283
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# Template for a verbose regex matching a single %-style format specifier.
# It has two positional placeholders: {0} is the pattern for the mapping key
# and {1} the pattern for the conversion type.
# A specifier preceded by an odd number of "%" is not matched, since the
# "prefix" group only consumes "%%" pairs and no "%" may precede it
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of %-style formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3302
3303
def limit_length(s, length):
    """ Add ellipses to overly long strings

    @param s       the string to shorten, or None
    @param length  maximum length of the result, ellipses included
    @returns       None if `s` is None; `s` itself if it fits; otherwise `s`
                   cut short and ending in "..." such that the result is
                   never longer than `length`
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length <= len(ELLIPSES):
        # There is no room for any text. A zero or negative slice end below
        # would keep most of the string and exceed the limit, so return only
        # as much of the ellipses as fits
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
3312
3313
def version_tuple(v):
    """Split a version string on "." and "-" into a tuple of ints

    Raises ValueError if any part is not an integer.
    """
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
3316
3317
def is_outdated_version(version, limit, assume_new=True):
    """Check whether `version` is older than `limit`

    If `version` is empty or either version cannot be parsed, the answer is
    the opposite of `assume_new`.
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            pass
    return not assume_new
3325
3326
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """
    # Imported when called rather than at the top of the module
    from ..update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
3333
3334
def args_to_str(args):
    """Get a short string representation for a subprocess command

    Every argument is shell-quoted and the results are joined with spaces.
    """
    return ' '.join(map(compat_shlex_quote, args))
3338
3339
def error_to_str(err):
    """ Format an exception as '<class name>: <message>' """
    return '{}: {}'.format(type(err).__name__, err)
3342
3343
def mimetype2ext(mt, default=NO_DEFAULT):
    """ Guess a file extension from a MIME type

    @param mt       The MIME type; parameters after ';' are ignored
    @param default  Returned when the type is unknown. If not given, None is
                    returned for non-string input and the subtype (with '+'
                    replaced by '.') for unknown types
    """
    has_default = default is not NO_DEFAULT
    if not isinstance(mt, str):
        return default if has_default else None

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop any parameters (the part after ';') and normalise whitespace and case
    mimetype = mt.partition(';')[0].strip().lower()
    subtype = mimetype.rpartition('/')[2]
    # For structured subtypes such as "dash+xml" this is the part after the last '+'
    suffix = subtype.rsplit('+')[-1]

    # Candidate keys are the full type, the subtype and the suffix
    # NOTE(review): presumably traverse_obj returns the first candidate that resolves - confirm
    ext = traversal.traverse_obj(MAP, mimetype, subtype, suffix)
    if ext:
        return ext
    if has_default:
        return default
    return subtype.replace('+', '.')
3432
3433
def ext2mimetype(ext_or_url):
    """ Guess the MIME type for a bare extension or for a URL/filename

    Returns None for empty input or if the type cannot be guessed
    """
    if not ext_or_url:
        return None
    # Input without a dot is treated as a bare extension and gets a dummy filename
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _ = mimetypes.guess_type(filename)
    return mime_type
3440
3441
def parse_codecs(codecs_str):
    """ Split a comma-separated codecs string into video/audio/subtitle codecs

    @returns  {} for empty input; otherwise a dict with 'vcodec', 'acodec',
              'dynamic_range' and, if a subtitle codec was found, 'scodec'.
              If no codec is recognised but exactly two were given, they are
              returned as 'vcodec' and 'acodec' in that order
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    raw_codecs = codecs_str.strip().strip(',').split(',')
    split_codecs = [codec for codec in map(str.strip, raw_codecs) if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros directly followed by another digit are dropped before matching
        # (e.g. "vp09.02" is compared as "vp9.2")
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                # Only the first video codec is used
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if not (vcodec or acodec or scodec):
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        return {}

    result = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if scodec is not None:
        result['scodec'] = scodec
    return result
3482
3483
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """ Choose a container extension that suits the given codecs/extensions

    @param vcodecs, vexts  Codecs and extensions of the video streams (same length)
    @param acodecs, aexts  Codecs and extensions of the audio streams (same length)
    @param preferences     Optional sequence of acceptable extensions, tried in order
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    mkv_allowed = not preferences or 'mkv' in preferences

    has_multiple_streams = len(acodecs) > 1 or len(vcodecs) > 1
    if mkv_allowed and has_multiple_streams:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codecs):
        # Reduce the first codec to its name: the part before the first '.',
        # with every '0' removed
        return try_get(codecs, getter=lambda x: x[0].split('.')[0].replace('0', ''))

    vcodec = sanitize_codec(vcodecs)
    acodec = sanitize_codec(acodecs)

    # First attempt: a container whose known codec set covers both codecs
    for candidate in preferences or COMPATIBLE_CODECS.keys():
        supported = COMPATIBLE_CODECS.get(candidate, set())
        if candidate == 'mkv':
            return candidate
        if all(codec in supported for codec in (vcodec, acodec)):
            return candidate

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    # Second attempt: decide based on the extensions of the streams
    for candidate in preferences or vexts:
        involved_exts = {candidate, *vexts, *aexts}
        if candidate == 'mkv':
            return candidate
        if involved_exts == {candidate}:
            return candidate
        if any(family.issuperset(involved_exts) for family in COMPATIBLE_EXTS):
            return candidate

    if mkv_allowed:
        return 'mkv'
    return preferences[-1]
3522
3523
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """ Guess the file extension from the headers of a response

    Tried in order: the filename in Content-Disposition, the x-amz-meta-name
    header, and finally the Content-Type (via mimetype2ext, with `default`)
    """
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    amz_meta_name = headers.get('x-amz-meta-name')
    if amz_meta_name:
        # Whatever follows the last '.'; the whole value if there is no '.'
        _, _, ext = amz_meta_name.rpartition('.')
        if ext:
            return ext

    content_type = headers.get('Content-Type')
    return mimetype2ext(content_type, default=default)
3542
3543
def encode_data_uri(data, mime_type):
    """ Build a base64 "data:" URI from bytes-like `data` and its MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3546
3547
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked if no limit is set by the user (age_limit)
    # or if the content is available for everyone (content_limit)
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3556
3557
# List of known byte-order-marks (BOM)
# The 4-byte UTF-32 marks are listed before the 2-byte UTF-16 marks they start with
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object if the decoded text starts with '<' (ignoring
    leading whitespace), None otherwise
    """

    encoding = 'utf-8'
    for marker, marker_encoding in BOMS:
        marker_len = len(marker)
        # Strip every repetition of the mark and remember its encoding
        while first_bytes[:marker_len] == marker:
            first_bytes = first_bytes[marker_len:]
            encoding = marker_encoding

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3577
3578
def determine_protocol(info_dict):
    """ Return the protocol of a format, guessing it from the URL if not set """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for scheme_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(scheme_prefix):
            return scheme_prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    # Fall back to the URL scheme
    return urllib.parse.urlparse(url).scheme
3599
3600
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param header_row  The first row of the table
    @param data        The remaining rows
    @param delim       If truthy, a string repeated to form a separator row below the header
    @param extra_gap   Additional spaces between columns
    @param hide_empty  Drop the columns in which every data cell has zero width
    """
    def visible_len(cell):
        # Width without terminal sequences and without the alignment tab
        return len(remove_terminal_sequences(cell).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_len(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, keep_flags):
        # Cells beyond the end of keep_flags are always kept
        return [cell for keep, cell in itertools.zip_longest(keep_flags, row, fillvalue=True) if keep]

    # A zero width is falsy, so the data column widths double as keep flags
    keep_flags = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, keep_flags)
    data = [keep_columns(row, keep_flags) for row in data]

    table = [header_row, *data]
    widths = column_widths(table)
    gap = extra_gap + 1
    if delim:
        separator = [delim * (col_width + gap) for col_width in widths]
        separator[-1] = separator[-1][:-gap * len(delim)]  # Remove extra_gap from end of delimiter
        table.insert(1, separator)

    lines = []
    for row in table:
        cells = []
        for pos, text in enumerate(map(str, row)):
            padding = widths[pos] - visible_len(text)
            if '\t' in text:
                # The tab expands to the padding, pushing the text after it to the right
                cells.append(text.replace('\t', ' ' * padding) + ' ' * gap)
            else:
                cells.append(text + ' ' * (padding + gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3631
3632
3633 def _match_one(filter_part, dct, incomplete):
3634     # TODO: Generalize code with YoutubeDL._build_format_filter
3635     STRING_OPERATORS = {
3636         '*=': operator.contains,
3637         '^=': lambda attr, value: attr.startswith(value),
3638         '$=': lambda attr, value: attr.endswith(value),
3639         '~=': lambda attr, value: re.search(value, attr),
3640     }
3641     COMPARISON_OPERATORS = {
3642         **STRING_OPERATORS,
3643         '<=': operator.le,  # "<=" must be defined above "<"
3644         '<': operator.lt,
3645         '>=': operator.ge,
3646         '>': operator.gt,
3647         '=': operator.eq,
3648     }
3649
3650     if isinstance(incomplete, bool):
3651         is_incomplete = lambda _: incomplete
3652     else:
3653         is_incomplete = lambda k: k in incomplete
3654
3655     operator_rex = re.compile(r'''(?x)
3656         (?P<key>[a-z_]+)
3657         \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3658         (?:
3659             (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3660             (?P<strval>.+?)
3661         )
3662         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3663     m = operator_rex.fullmatch(filter_part.strip())
3664     if m:
3665         m = m.groupdict()
3666         unnegated_op = COMPARISON_OPERATORS[m['op']]
3667         if m['negation']:
3668             op = lambda attr, value: not unnegated_op(attr, value)
3669         else:
3670             op = unnegated_op
3671         comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3672         if m['quote']:
3673             comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3674         actual_value = dct.get(m['key'])
3675         numeric_comparison = None
3676         if isinstance(actual_value, (int, float)):
3677             # If the original field is a string and matching comparisonvalue is
3678             # a number we should respect the origin of the original field
3679             # and process comparison value as a string (see
3680             # https://github.com/ytdl-org/youtube-dl/issues/11082)
3681             try:
3682                 numeric_comparison = int(comparison_value)
3683             except ValueError:
3684                 numeric_comparison = parse_filesize(comparison_value)
3685                 if numeric_comparison is None:
3686                     numeric_comparison = parse_filesize(f'{comparison_value}B')
3687                 if numeric_comparison is None:
3688                     numeric_comparison = parse_duration(comparison_value)
3689         if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3690             raise ValueError('Operator %s only supports string values!' % m['op'])
3691         if actual_value is None:
3692             return is_incomplete(m['key']) or m['none_inclusive']
3693         return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3694
3695     UNARY_OPERATORS = {
3696         '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3697         '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3698     }
3699     operator_rex = re.compile(r'''(?x)
3700         (?P<op>%s)\s*(?P<key>[a-z_]+)
3701         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3702     m = operator_rex.fullmatch(filter_part.strip())
3703     if m:
3704         op = UNARY_OPERATORS[m.group('op')]
3705         actual_value = dct.get(m.group('key'))
3706         if is_incomplete(m.group('key')) and actual_value is None:
3707             return True
3708         return op(actual_value)
3709
3710     raise ValueError('Invalid filter part %r' % filter_part)
3711
3712
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    The conditions are separated by "&" (use "\\&" for a literal "&")
    and all of them must pass
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3723
3724
def match_filter_func(filters, breaking_filters=None):
    """ Build a match function from filter strings

    @param filters           A filter string or a collection of them; an entry
                             passes if any of them matches. The special filter
                             '-' enables interactive mode
    @param breaking_filters  Filters in the same format; the returned function
                             raises RejectedVideoReached if an entry fails them
    @returns  None if no filters are given at all. Otherwise a function
              (info_dict, incomplete=False) returning None if the entry passes
              (NO_DEFAULT in interactive mode for complete entries), or a
              message explaining why it was skipped
    """
    if not (filters or breaking_filters):
        return None
    check_breaking = match_filter_func(breaking_filters) or (lambda _, __: None)
    filter_set = set(variadic(filters or []))

    interactive = '-' in filter_set
    filter_set.discard('-')

    def _match_func(info_dict, incomplete=False):
        rejection = check_breaking(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if filter_set and not any(match_str(f, info_dict, incomplete) for f in filter_set):
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filter_set))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
        return NO_DEFAULT if interactive and not incomplete else None
    return _match_func
3747
3748
class download_range_func:
    """ Callable that yields the sections of a video to download

    @param chapters  Regexes that are searched for in the chapter titles
    @param ranges    Iterable of (start_time, end_time) pairs
    """

    def __init__(self, chapters, ranges):
        self.chapters = chapters
        self.ranges = ranges

    def __call__(self, info_dict, ydl):
        # With neither chapters nor ranges, a single empty section is yielded
        if not (self.ranges or self.chapters):
            yield {}

        if info_dict.get('chapters'):
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'
        for regex in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(regex, chapter['title']):
                    continue
                warning = None
                yield dict(chapter, index=index)
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3775
3776
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP/TTML time expression to seconds

    Accepts a plain number with an optional "s" suffix, or a clock value like
    "HH:MM:SS", "HH:MM:SS.fraction" or "HH:MM:SS:fraction".
    Returns None for empty or unrecognised input
    """
    if not time_expr:
        return None

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match is None:
        return None
    hours, minutes, seconds = clock_match.groups()
    # The fraction may be separated by ':' instead of '.'
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3788
3789
def srt_subtitles_timecode(seconds):
    """ Format a time in seconds as an SRT timecode (HH:MM:SS,mmm) """
    msec = seconds * 1000
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(msec)
3792
3793
def ass_subtitles_timecode(seconds):
    """ Format a time in seconds as an ASS timecode (H:MM:SS.cc) """
    timetuple = timetuple_from_msec(seconds * 1000)
    # The last field is in hundredths of a second; '%02d' drops the fractional part
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
3797
3798
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT. Supported styling is translated to
    <b>, <i>, <u> and <font> tags; paragraphs without a usable begin and
    end/dur are skipped

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the data contains no paragraphs
    '''
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # style id -> {property: value}, filled in by the style resolution below
    styles = {}
    # Style applied to every element; collected from <body> and <div>
    default_style = {}

    class TTMLPElementParser:
        # NOTE(review): the two lists below are class attributes that are mutated
        # in place, so they are shared by all parser instances created during one
        # dfxp2srt() call (one per paragraph). `_out` is rebound per instance by
        # `+=`. Making the lists per-instance changes the generated markup
        # whenever an entry is left on `_applied_styles` - confirm before refactoring
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                # Effective style: defaults, then the referenced style, then inline attributes
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties that the most recently applied style already set to this value
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened by the matching start(), innermost first
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize the node and feed it through the parser above to get the SRT markup
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Map the legacy namespaces to the current ones so that a single set of xpaths suffices
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style may reference a parent that has not
    # been registered yet, so passes are repeated until every parent is known
    while True:
        repeat = False
        known_styles = len(styles)
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # A pass that registers no new style cannot unblock the next one. This
        # happens when a parent is never registered (undeclared, self-referencing
        # or without any supported property); stop instead of looping forever
        if not repeat or len(styles) == known_styles:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3965
3966
def cli_option(params, command_option, param, separator=None):
    """ Build the command line arguments for an option that takes a value

    @returns  [] if `param` is unset in params, [option, value] without a
              separator, or a single 'option<separator>value' argument
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3972
3973
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Build the command line arguments for a boolean option

    The value of `param` must be True, False or None (unset)
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    # Let cli_option look up the textual value for the flag
    value_map = {True: true_value, False: false_value}
    return cli_option(value_map, command_option, flag, separator)
3978
3979
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] if `param` in params equals expected_value, else [] """
    if params.get(param) == expected_value:
        return [command_option]
    return []
3982
3983
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """ Look up configured arguments in argdict

    @param argdict     Dict mapping lower-case keys to lists of arguments.
                       A list/tuple is returned as-is if use_compat is set
    @param keys        List/tuple of keys, tried in order. An entry may itself
                       be a group of keys, whose arguments are concatenated
    @param default     Returned (as-is, not copied) when nothing is found
    @param use_compat  Whether a list/tuple argdict is accepted
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        return argdict if use_compat else default
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_group in keys:
        lookups = (argdict.get(key.lower()) for key in variadic(key_group))
        arg_lists = [args for args in lookups if args is not None]
        if arg_lists:
            return list(itertools.chain.from_iterable(arg_lists))
    return default
4002
4003
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """ Build the list of lookup keys for main_key/exe and fetch the arguments

    The root key is `exe` if it equals main_key, else 'main_key+exe'; each of
    `keys` is appended to it. If the root key itself is among the lookup keys,
    the fallbacks (main_key, exe) and 'default' are added; otherwise the
    list/tuple compatibility mode of cli_configuration_args is disabled
    """
    main_key = main_key.lower()
    exe = exe.lower()
    same_key = main_key == exe
    root_key = exe if same_key else f'{main_key}+{exe}'
    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if not same_key:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
4015
4016
class ISO639Utils:
    """ Conversion between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter) language codes """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # The first entry in map order wins, so current codes
        # take precedence over the replaced ones listed after them
        matches = (short_name for short_name, long_name in cls._lang_map.items() if long_name == code)
        return next(matches, None)
4220
4221
class ISO3166Utils:
    """Lookup of country names by two-letter (ISO 3166-1 alpha-2) country code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full country name for a two-letter country code

        The lookup is case-insensitive. Returns None for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4483
4484
class GeoUtils:
    """Helpers for generating IPv4 addresses that appear to belong to a given country"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Pick a random IPv4 address from a country's block or an explicit CIDR block

        @param code_or_block    two-character country code (case-insensitive),
                                or a block in "address/prefix-length" notation
        @returns                dotted-quad address string, or None if the
                                country code has no known block
        """
        block = code_or_block
        if len(code_or_block) == 2:
            # Anything two characters long is treated as a country code
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None

        network, prefix_len = block.split('/')
        (lowest,) = struct.unpack('!L', socket.inet_aton(network))
        # Setting all host bits yields the last address of the block
        highest = lowest | (0xffffffff >> int(prefix_len))
        chosen = random.randint(lowest, highest)
        return str(socket.inet_ntoa(struct.pack('!L', chosen)))
4743
4744
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden per request

    A request may carry a `Ytdl-request-proxy` header naming the proxy to use
    (or `__noproxy__` for a direct connection). The header is consumed here.
    """

    def __init__(self, proxies=None):
        # Install default openers for http/https before the base initializer runs
        for scheme in ('http', 'https'):
            setattr(
                self, f'{scheme}_open',
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open: meth(r, proxy, type))
        super().__init__(proxies)

    def proxy_open(self, req, proxy, type):
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            # The per-request header wins over the handler-wide proxy
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = urllib.parse.urlparse(proxy).scheme.lower()
        if proxy_scheme not in ('socks', 'socks4', 'socks4a', 'socks5'):
            return super().proxy_open(req, proxy, type)

        # SOCKS proxies are only recorded on the request here;
        # yt-dlp's http/https handlers wrap the socket with socks
        req.add_header('Ytdl-socks-proxy', proxy)
        return None
4768
4769
4770 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4771 # released into Public Domain
4772 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4773
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    Zero and negative values produce a single zero byte.

    If optional blocksize is given and greater than zero, the front of the
    byte string is padded with binary zeros so that the length is a multiple
    of blocksize.
    """
    n = int(n)
    words = []
    # Emit 32-bit words, least significant first
    while n > 0:
        words.append(struct.pack('>I', n & 0xffffffff))
        n >>= 32

    # Drop the zero bytes padding the most significant word;
    # nothing is left over only when no word was emitted at all
    packed = b''.join(reversed(words)).lstrip(b'\000') or b'\000'

    if blocksize > 0:
        shortfall = -len(packed) % blocksize
        packed = b'\000' * shortfall + packed
    return packed
4802
4803
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Left-pad with zero bytes so the data splits into whole 32-bit words
    shortfall = -len(s) % 4
    if shortfall:
        s = b'\000' * shortfall + s

    value = 0
    for offset in range(0, len(s), 4):
        (word,) = struct.unpack('>I', s[offset:offset + 4])
        value = (value << 32) + word
    return value
4819
4820
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    The data is read as a little-endian number (its bytes are reversed
    before conversion) and raised to `exponent` modulo `modulus`.

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    reversed_hex = binascii.hexlify(data[::-1])
    ciphertext = pow(int(reversed_hex, 16), exponent, modulus)
    return f'{ciphertext:x}'
4836
4837
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout `0x00 0x02 <padding> 0x00 <data>`, where the
    padding consists of random nonzero bytes.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data is longer than length - 11
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding must not contain zero bytes: the first zero after the
    # two-byte header marks where the data starts
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2, *pseudo_random, 0, *data]
4851
4852
4853 def _base_n_table(n, table):
4854     if not table and not n:
4855         raise ValueError('Either table or n must be specified')
4856     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4857
4858     if n and n != len(table):
4859         raise ValueError(f'base {n} exceeds table length {len(table)}')
4860     return table
4861
4862
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string

    @param num      non-negative integer to convert
    @param n        base; the first n characters of the table are used
    @param table    digit characters ordered by value (default: 0-9a-zA-Z)
    @returns        the digits of num in the given base, most significant first
    @raises ValueError  if num is negative, the base is smaller than 2 (for
                        nonzero num), or n/table are rejected by _base_n_table
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    # Floor division never brings a negative number (or any nonzero number
    # in base 1) down to zero, so the loop below would not terminate
    if num < 0:
        raise ValueError('Cannot encode a negative number')
    if base < 2:
        raise ValueError(f'Cannot encode a nonzero number in base {base}')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4874
4875
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int

    `n` and `table` select the digit characters the same way as for
    encode_base_n. A character that is not in the table raises KeyError.
    """
    digit_values = {}
    for value, digit in enumerate(_base_n_table(n, table)):
        digit_values[digit] = value

    radix = len(digit_values)
    total = 0
    for digit in string:
        total = total * radix + digit_values[digit]
    return total
4883
4884
def decode_packed_codes(code):
    """Expand code matched by PACKED_CODES_RE

    The match provides the obfuscated code, a base, a symbol count and a
    `|`-separated symbol list. Every word of the obfuscated code is the
    base-n index of the symbol that replaces it; an empty symbol means the
    word stands for itself.
    """
    packed, radix, remaining, words = re.search(PACKED_CODES_RE, code).groups()
    radix = int(radix)
    remaining = int(remaining)
    words = words.split('|')

    replacements = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        replacements[token] = words[remaining] or token

    def substitute(word_match):
        return replacements[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, packed)
4901
4902
def caesar(s, alphabet, shift):
    """Apply a Caesar cipher to `s`

    Each character found in `alphabet` is replaced by the one `shift`
    positions further on (wrapping around); other characters are kept.
    """
    if shift == 0:
        return s

    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4910
4911
def rot47(s):
    """Apply the ROT47 cipher: rotate the printable ASCII characters '!' to '~' by 47 places"""
    printable = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable, 47)
4914
4915
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list (`KEY=value,KEY="quoted value",...`) into a dict

    Surrounding double quotes are removed from quoted values.
    """
    attributes = {}
    for match in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        name, value = match.group('key', 'val')
        attributes[name] = value[1:-1] if value.startswith('"') else value
    return attributes
4923
4924
def urshift(val, n):
    """Unsigned right shift: negative values are first mapped to their 32-bit unsigned equivalent"""
    if val < 0:
        val += 0x100000000
    return val >> n
4927
4928
def write_xattr(path, key, value):
    """Store `value` (bytes) as the extended attribute `key` of the file at `path`

    @raises XAttrMetadataError      if writing the attribute fails
    @raises XAttrUnavailableError   if no module or tool for writing xattrs is found
    """
    if compat_os_name == 'nt':
        # Windows: Write xattrs to NTFS Alternate Data Streams:
        # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as stream:
                stream.write(value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules
    if getattr(xattr, '_yt_dlp__identifier', None) != 'pyxattr':
        setter = xattr.setxattr if xattr else None
    elif version_tuple(xattr.__version__) >= (0, 5, 0):
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        setter = xattr.set
    else:
        setter = None

    if setter:
        try:
            setter(path, key, value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    if check_executable('setfattr', ['--version']):
        exe = 'setfattr'
    elif check_executable('xattr', ['-h']):
        exe = 'xattr'
    else:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The value is passed on the command line, so it has to be text
    value = value.decode()
    if exe == 'xattr':
        cmd = [exe, '-w', key, value, path]
    else:
        cmd = [exe, '-n', key, '-v', value, path]

    try:
        _, stderr, returncode = Popen.run(
            cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as err:
        raise XAttrMetadataError(err.errno, err.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4978
4979
def random_birthday(year_field, month_field, day_field):
    """Generate a random date of birth between 1950-01-01 and 1995-12-31 (inclusive)

    Returns a dict that maps the given field names to the year, month and day
    as strings (without zero padding).
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(days=random.randint(0, span_days))

    field_names = (year_field, month_field, day_field)
    parts = (birthday.year, birthday.month, birthday.day)
    return dict(zip(field_names, map(str, parts)))
4990
4991
def find_available_port(interface=''):
    """Ask the OS for a free port on `interface` by binding a socket to port 0

    Returns the port number, or None if the socket operations fail.
    """
    with contextlib.suppress(OSError):
        with socket.socket() as probe:
            probe.bind((interface, 0))
            return probe.getsockname()[1]
    return None
4999
5000
# Templates for internet shortcut files, which are plain text files.
# Each template is a %-format string that takes its values from a mapping of named fields.

# INI-style `[InternetShortcut]` file; requires the `url` field
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# Apple property list (XML) with a single `URL` key; requires the `url` field.
# NOTE: the value is inserted as is, without any XML escaping at this point
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# `[Desktop Entry]` file of type `Link`; requires the `filename` and `url` fields
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps a link type name to the template for that kind of shortcut file
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5032
5033
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    Unicode hostnames are converted to Punycode. An explicit port is kept unless it is port 80 on a scheme whose default port is 80 (http, ws). IRIs without a hostname (e.g. relative references) are supported.

    Source for the `safe` arguments used below: https://url.spec.whatwg.org/#percent-encoded-bytes.

    @raises ValueError  if the IRI contains an IPv6 (bracketed) host
    """

    iri_parts = urllib.parse.urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no network location at all
    if iri_parts.hostname:
        # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
        net_location += iri_parts.hostname.encode('idna').decode()

    port = iri_parts.port
    # Port 80 may only be omitted where it is the scheme's default;
    # dropping it from e.g. an https URI would redirect the request to port 443
    if port is not None and not (port == 80 and iri_parts.scheme in ('http', 'ws')):
        net_location += f':{port}'

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5076
5077
def to_high_limit_path(path):
    """Return `path` in a form that is not subject to the MAX_PATH limit on Windows

    On win32/cygwin the absolute path is prefixed with `\\\\?\\`; on other
    platforms the path is returned unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
5084
5085
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Look up `field` in `obj` and format the value with `template`.

    @param field    Path(s) passed (via `variadic`) to `traversal.traverse_obj`
    @param ignore   Value(s) for which `default` is returned instead.
                    When not given, every falsy value is ignored
    @param func     Applied to the value before it is substituted into `template`
    @returns        `template % func(value)`, or `default` if the value is ignored
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skipped = not val
    else:
        skipped = val in variadic(ignore)
    return default if skipped else template % func(val)
5091
5092
def clean_podcast_url(url):
    """Return `url` with every known podcast tracking/analytics redirect prefix removed"""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix_re, '', url)
5108
5109
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version 4 UUID as a lowercase 8-4-4-4-12 hexadecimal string.

    In the template, every 'x' is replaced by a random hex digit and 'y' by the variant
    nibble; the literal '4' is the version nibble.
    """
    def _random_digit(mobj):
        # RFC 4122 requires the two most significant bits of the variant nibble to be `10`,
        # so 'y' may only become 8, 9, a or b
        return random.choice('89ab' if mobj.group() == 'y' else _HEX_TABLE)

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5115
5116
def make_dir(path, to_screen=None):
    """Ensure that the directory containing `path` exists, creating parents as needed.

    @param path         File path whose directory component should exist
    @param to_screen    Optional callable; called with an error message if creation fails
    @returns            True on success (also when `path` has no directory component),
                        False if an OSError occurred
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # callable() returns a bool, so it has to be tested directly;
        # comparing it against None would always be true
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
5127
5128
def get_executable_path():
    """Return the absolute directory of the path reported by `update._get_variant_and_executable_path`"""
    # Imported here rather than at module level
    # NOTE(review): presumably to avoid a circular import with the update module - confirm
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
5133
5134
def get_user_config_dirs(package_name):
    """Yield the candidate per-user config directories of `package_name`.

    Order: the XDG config home, %APPDATA% (only when that variable is set and non-empty)
    and finally a dot-directory in the user's home. Existence is not checked.
    """
    # .config (e.g. ~/.config/package_name)
    config_home = os.getenv('XDG_CONFIG_HOME')
    if not config_home:
        config_home = compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # home (~/.package_name)
    home = compat_expanduser('~')
    yield os.path.join(home, f'.{package_name}')
5147
5148
def get_system_config_dirs(package_name):
    """Yield the system-wide config directories of `package_name` (currently only /etc/package_name)"""
    yield from (os.path.join('/etc', package_name),)
5152
5153
def time_seconds(**kwargs):
    """
    Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the offset that the keyword arguments describe.
    The keyword arguments are passed to datetime.timedelta (e.g. hours=9)
    """
    offset = datetime.timedelta(**kwargs)
    return time.time() + offset.total_seconds()
5159
5160
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Return `payload_data` signed with `key` (str) as bytes of the form header.payload.signature

    @param headers  Extra header fields; they are merged over {'alg': 'HS256', 'typ': 'JWT'}

    NB: Each segment is encoded with base64.b64encode, i.e. with the standard alphabet and
        with the padding kept
    """
    header_data = dict(alg='HS256', typ='JWT')
    header_data.update(headers or ())
    segments = [
        base64.b64encode(json.dumps(part).encode())
        for part in (header_data, payload_data)
    ]
    # The signature covers "<header>.<payload>"
    mac = hmac.new(key.encode(), b'.'.join(segments), hashlib.sha256)
    segments.append(base64.b64encode(mac.digest()))
    return b'.'.join(segments)
5178
5179
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of the JWT string `jwt`. The signature is NOT verified

    Raises ValueError unless `jwt` consists of exactly three dot-separated segments
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded_payload = f'{payload_b64}==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
5186
5187
# False on Windows until windows_enable_vt_mode() succeeds; None on every other OS
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Whether terminal (escape) sequences may be written to `stream`.

    Requires VT mode to be enabled on Windows, or a non-empty TERM environment variable
    elsewhere, and `stream.isatty()` to be true. The result is cached per stream.
    """
    if compat_os_name == 'nt':
        capable = WINDOWS_VT_MODE
    else:
        capable = os.getenv('TERM')
    if not capable:
        return False
    try:
        return stream.isatty()
    except BaseException:
        # Anything going wrong while querying the stream counts as "not supported"
        return False
5202
5203
def windows_enable_vt_mode():
    """Enable virtual terminal processing for the Windows console.

    Does nothing when get_windows_version() reports a version older than 10.0.10586.
    On success, WINDOWS_VT_MODE is set to True and the cache of
    supports_terminal_sequences is cleared. Raises Exception if the console mode
    can not be read or changed.

    Ref: https://bugs.python.org/issue30075 """
    if get_windows_version() < (10, 0, 10586):
        return

    # Windows-only modules, hence imported only after the version check
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly; the descriptor is closed again below
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Keep all flags that are currently set and add the VT processing flag
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    # Only reached when both calls succeeded
    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Results cached while VT mode was still disabled are stale now
    supports_terminal_sequences.cache_clear()
5234
5235
# ESC '[' followed by one or more non-'m' characters and a closing 'm'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with everything matched by `_terminal_sequences_re` removed"""
    return re.sub(_terminal_sequences_re, '', string)
5241
5242
def number_of_digits(number):
    """Return the length of `number` when formatted with '%d' (a minus sign is counted too)"""
    formatted = '%d' % number
    return len(formatted)
5245
5246
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the str() of all truthy `values` with `delim`.

    @param from_dict    If given, `values` are treated as paths that are looked up in it
                        (using `traversal.traverse_obj`) and the results are joined instead
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(path)) for path in values)
    return delim.join(str(item) for item in values if item)
5251
5252
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest (width, height) among the formats and, for each thumbnail:
    * Modify the URL: Replace whatever `url_width_re` matches with that width
    * Fill in that width and height

    `thumbnails` is returned as is when no format has a known width.
    This function is useful with video services that scale the provided thumbnails on demand
    """
    best_width, best_height = max(
        ((fmt.get('width') or 0, fmt.get('height') or 0) for fmt in formats),
        default=(0, 0))
    if not best_width:
        return thumbnails

    width_text = str(best_width)
    scaled = []
    for thumb in thumbnails:
        scaled.append(merge_dicts(
            {'url': re.sub(url_width_re, width_text, thumb['url'])},
            {'width': best_width, 'height': best_height},
            thumb))
    return scaled
5273
5274
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into a (start, end, total) tuple.

    `end` and `total` are None when absent; all three are None when the value is empty
    or does not look like a bytes range """
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if crg is None:
        return None, None, None
    start, end, total = crg.groups()
    return int(start), int_or_none(end), int_or_none(total)
5283
5284
def read_stdin(what):
    """Tell the user (using `write_string`) that `what` is read from STDIN and return sys.stdin

    The message names the key combination that sends EOF: Ctrl+Z on Windows, Ctrl+D elsewhere
    """
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    else:
        eof = 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5289
5290
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data     The leading bytes of the file
    @returns (encoding, bytes to skip)
             encoding is None if neither a BOM nor a "# coding: ..." declaration is found
    """

    # BOM marks are given priority over declarations
    bom_match = next(
        ((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_match is not None:
        return bom_match

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if mobj is None:
        return None, 0
    return mobj.group(1).decode(), 0
5307
5308
class Config:
    """A list of command-line style options together with the configs that it references.

    Every `config_locations` entry found in the own arguments is loaded into a child
    Config (see `configs`). All configs of a tree share one `_loaded_paths` set,
    so that no location is loaded more than once.
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    # Marks "no default given" in read_file, so that an explicit default=None still works
    _NO_FILE_DEFAULT = object()

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Set the own arguments (and the file they were read from) and load all referenced configs

        May only be called once per instance. See load_configs for the return value
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse the own arguments and load every config location that they mention

        @returns    False if `filename` was already loaded (nothing is done then), else True
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            # Relative config locations are resolved against the directory of this file
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # STDIN is read at most once
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        # The own arguments (with the login info hidden), followed by the nested configs,
        # each of which is prefixed with "| "
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=_NO_FILE_DEFAULT):
        """Read the options stored in `filename`

        @param default  Returned if the file can not be opened.
                        A new empty list is returned when it is not given
        @returns        The list of arguments, split like a shell would (comments are allowed)
        Raises ValueError if the contents can not be decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            # silently skip if file is not present
            # A new list is created on every call, so that callers never share (and mutate) one object
            return [] if default is Config._NO_FILE_DEFAULT else default
        # The file is closed however the block is left
        with optionf:
            try:
                enc, skip = determine_file_encoding(optionf.read(512))
                optionf.seek(skip, io.SEEK_SET)
            except OSError:
                enc = None  # silently skip read errors
            try:
                # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                contents = optionf.read().decode(enc or preferredencoding())
                return shlex.split(contents, comments=True)
            except Exception as err:
                raise ValueError(f'Unable to parse "{filename}": {err}') from err

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of the argument list `opts` in which the values of all
        username/password options are replaced with "PRIVATE"

        Both the "--option=value" and the "--option value" forms are handled
        """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config from `args` (see init) and keep it, unless it was already loaded"""
        config = type(self)(self.parser, label)
        # Share the set, so that the whole tree knows what has been loaded
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """The arguments of all nested configs (latest appended config first), followed by the own arguments"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
5416
5417
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes

    A private event loop is created per instance, and every operation runs on it until
    it completes. Can be used as a context manager. `__exit__` is also registered with
    atexit, and calling it more than once is harmless.
    """
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        # Connect only once, even if the instance is entered again
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        # The atexit hook may run after an explicit exit; nothing can be done on a closed loop
        if self.loop.is_closed():
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # The remaining tasks have to be cancelled while the loop can still run them,
            # i.e. before it is closed
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` on `loop` until it completes and return its result

        Raises ValueError if `main` is not a coroutine
        """
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel all tasks of `loop`, wait for them and report the exceptions that they raised"""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # The "loop" argument of gather() was removed in python 3.10.
        # It is not needed, since gather() takes the loop from the tasks
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5487
5488
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones

    The names in the result are normalized with str.title()"""
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
5492
5493
def cached_method(f):
    """Cache a method

    The results are stored on the instance, per method name, and are keyed by the bound
    arguments (with the defaults applied), so that `f(1)` and `f(x=1)` share one entry
    """
    sig = inspect.signature(f)
    method_name = f.__name__

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        call = sig.bind(self, *args, **kwargs)
        call.apply_defaults()
        # The first value belongs to `self`, which must not be part of the key
        cache_key = tuple(call.arguments.values())[1:]

        all_caches = vars(self).setdefault('_cached_method__cache', {})
        results = all_caches.setdefault(method_name, {})
        if cache_key not in results:
            results[cache_key] = f(self, *args, **kwargs)
        return results[cache_key]

    return wrapper
5509
5510
class classproperty:
    """property access for class methods with optional caching

    Usable both as `@classproperty` and as `@classproperty(cache=True)`.
    With caching, the function is evaluated once per class that the property is accessed on
    """
    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Called with options only: wait for the function to decorate
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = None
        if cache:
            self._cache = {}

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls not in cache:
            cache[cls] = self.func(cls)
        return cache[cls]
5529
5530
class function_with_repr:
    """Callable wrapper around `func` with a custom repr()

    repr() gives `repr_` if it is given (and truthy), else "<module>.<qualname>" of `func`
    """
    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5543
5544
class Namespace(types.SimpleNamespace):
    """SimpleNamespace that can be iterated over; iteration yields the attribute values"""

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        """The (name, value) pairs of all attributes"""
        return vars(self).items()
5554
5555
# File extensions (without the leading dot), grouped by the kind of media
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# `video` and `audio` additionally contain the respective `common_*` extensions (at the end)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in this order
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5570
5571
class RetryManager:
    """Iterable that keeps yielding itself for as long as the previous attempt set an error
    and the number of attempts does not exceed `retries`

    Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # `attempt` counts the attempts that were started.
    # `_error` is None before the first attempt and NO_DEFAULT while an attempt has not failed
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        # A falsy `_retries` (e.g. None) means "no retries"
        self.retries = _retries or 0
        # `kwargs` are passed on to every call of the callback
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # NO_DEFAULT means that the last attempt finished without setting an error
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        """The error set during the current attempt, or None if there is none"""
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Reset the error, so that an attempt that does not set one ends the loop
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            # Resumed after the loop body ran: report the error (if any) before trying again
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        @param e            The error that occurred
        @param count        Number of the attempt that failed
        @param retries      Maximum number of retries
        @param sleep_func   Delay in seconds before the next attempt,
                            or a callable that is called with n=<count - 1> and returns it
        @param info         Callable used to report the delay
        @param warn         Callable used to report the retry
        @param error        Callable used to report that no retries are left.
                            If it is not given, `e` is raised instead
        @param suffix       Text that is added after "Retrying" in the warning
        """
        if count > retries:
            # Out of retries: report the error if possible, else raise it
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # Use the cause (else the original message) without the trailing period
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5625             time.sleep(delay)
5626
5627
def make_archive_id(ie, video_id):
    """Return the archive id "<lowercased ie key> <video_id>"

    @param ie   Either the ie key itself (str) or an object providing it through `ie_key()`
    """
    if not isinstance(ie, str):
        ie = ie.ie_key()
    return f'{ie.lower()} {video_id}'
5631
5632
def truncate_string(s, left, right=0):
    """Shorten `s` to `left + right` characters, replacing the middle part with "..."

    The first `left - 3` and the last `right` characters are kept.
    `s` is returned as is when it is None or already short enough
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5638
5639
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Build the list of requested items from `options`

    @param options      Item names or aliases. The ones prefixed with "-" are removed instead
    @param alias_dict   Maps an alias to the options that it stands for.
                        alias_dict['all'] is required and lists all valid items
    @param use_regex    Treat the (non-alias) options as case-insensitive regexes
                        that are matched in full against the valid items
    @param start        The items requested initially
    @returns            orderedSet of the requested items
    Raises ValueError(option) for an unknown item (when not using regex)
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for option in options:
        discard = option.startswith('-')
        name = option[1:] if discard else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if discard:
                # Discarding an alias inverts each of the options it stands for
                expansion = [i[1:] if i.startswith('-') else f'-{i}' for i in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            current = [name]
        else:
            raise ValueError(name)

        if not discard:
            requested.extend(current)
            continue
        for item in current:
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5668
5669
5670 class FormatSorter:
5671     regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5672
5673     default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5674                'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5675                'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
5676     ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5677                     'height', 'width', 'proto', 'vext', 'abr', 'aext',
5678                     'fps', 'fs_approx', 'source', 'id')
5679
5680     settings = {
5681         'vcodec': {'type': 'ordered', 'regex': True,
5682                    'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5683         'acodec': {'type': 'ordered', 'regex': True,
5684                    'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
5685         'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5686                 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5687         'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5688                   'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5689         'vext': {'type': 'ordered', 'field': 'video_ext',
5690                  'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5691                  'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
5692         'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5693                  'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5694                  'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
5695         'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5696         'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5697                        'field': ('vcodec', 'acodec'),
5698                        'function': lambda it: int(any(v != 'none' for v in it))},
5699         'ie_pref': {'priority': True, 'type': 'extractor'},
5700         'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5701         'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5702         'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5703         'quality': {'convert': 'float', 'default': -1},
5704         'filesize': {'convert': 'bytes'},
5705         'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5706         'id': {'convert': 'string', 'field': 'format_id'},
5707         'height': {'convert': 'float_none'},
5708         'width': {'convert': 'float_none'},
5709         'fps': {'convert': 'float_none'},
5710         'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5711         'tbr': {'convert': 'float_none'},
5712         'vbr': {'convert': 'float_none'},
5713         'abr': {'convert': 'float_none'},
5714         'asr': {'convert': 'float_none'},
5715         'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5716
5717         'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5718         'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
5719         'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
5720         'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5721         'res': {'type': 'multiple', 'field': ('height', 'width'),
5722                 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5723
5724         # Actual field names
5725         'format_id': {'type': 'alias', 'field': 'id'},
5726         'preference': {'type': 'alias', 'field': 'ie_pref'},
5727         'language_preference': {'type': 'alias', 'field': 'lang'},
5728         'source_preference': {'type': 'alias', 'field': 'source'},
5729         'protocol': {'type': 'alias', 'field': 'proto'},
5730         'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5731         'audio_channels': {'type': 'alias', 'field': 'channels'},
5732
5733         # Deprecated
5734         'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5735         'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5736         'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5737         'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5738         'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5739         'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5740         'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5741         'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5742         'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5743         'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5744         'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5745         'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5746         'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5747         'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5748         'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5749         'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5750         'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5751         'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5752         'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5753         'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5754     }
5755
5756     def __init__(self, ydl, field_preference):
5757         self.ydl = ydl
5758         self._order = []
5759         self.evaluate_params(self.ydl.params, field_preference)
5760         if ydl.params.get('verbose'):
5761             self.print_verbose_info(self.ydl.write_debug)
5762
5763     def _get_field_setting(self, field, key):
5764         if field not in self.settings:
5765             if key in ('forced', 'priority'):
5766                 return False
5767             self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5768                                         'deprecated and may be removed in a future version')
5769             self.settings[field] = {}
5770         propObj = self.settings[field]
5771         if key not in propObj:
5772             type = propObj.get('type')
5773             if key == 'field':
5774                 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5775             elif key == 'convert':
5776                 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5777             else:
5778                 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5779             propObj[key] = default
5780         return propObj[key]
5781
5782     def _resolve_field_value(self, field, value, convertNone=False):
5783         if value is None:
5784             if not convertNone:
5785                 return None
5786         else:
5787             value = value.lower()
5788         conversion = self._get_field_setting(field, 'convert')
5789         if conversion == 'ignore':
5790             return None
5791         if conversion == 'string':
5792             return value
5793         elif conversion == 'float_none':
5794             return float_or_none(value)
5795         elif conversion == 'bytes':
5796             return parse_bytes(value)
5797         elif conversion == 'order':
5798             order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5799             use_regex = self._get_field_setting(field, 'regex')
5800             list_length = len(order_list)
5801             empty_pos = order_list.index('') if '' in order_list else list_length + 1
5802             if use_regex and value is not None:
5803                 for i, regex in enumerate(order_list):
5804                     if regex and re.match(regex, value):
5805                         return list_length - i
5806                 return list_length - empty_pos  # not in list
5807             else:  # not regex or  value = None
5808                 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5809         else:
5810             if value.isnumeric():
5811                 return float(value)
5812             else:
5813                 self.settings[field]['convert'] = 'string'
5814                 return value
5815
5816     def evaluate_params(self, params, sort_extractor):
5817         self._use_free_order = params.get('prefer_free_formats', False)
5818         self._sort_user = params.get('format_sort', [])
5819         self._sort_extractor = sort_extractor
5820
5821         def add_item(field, reverse, closest, limit_text):
5822             field = field.lower()
5823             if field in self._order:
5824                 return
5825             self._order.append(field)
5826             limit = self._resolve_field_value(field, limit_text)
5827             data = {
5828                 'reverse': reverse,
5829                 'closest': False if limit is None else closest,
5830                 'limit_text': limit_text,
5831                 'limit': limit}
5832             if field in self.settings:
5833                 self.settings[field].update(data)
5834             else:
5835                 self.settings[field] = data
5836
5837         sort_list = (
5838             tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5839             + (tuple() if params.get('format_sort_force', False)
5840                 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5841             + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5842
5843         for item in sort_list:
5844             match = re.match(self.regex, item)
5845             if match is None:
5846                 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5847             field = match.group('field')
5848             if field is None:
5849                 continue
5850             if self._get_field_setting(field, 'type') == 'alias':
5851                 alias, field = field, self._get_field_setting(field, 'field')
5852                 if self._get_field_setting(alias, 'deprecated'):
5853                     self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5854                                                 f'be removed in a future version. Please use {field} instead')
5855             reverse = match.group('reverse') is not None
5856             closest = match.group('separator') == '~'
5857             limit_text = match.group('limit')
5858
5859             has_limit = limit_text is not None
5860             has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5861             has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5862
5863             fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5864             limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5865             limit_count = len(limits)
5866             for (i, f) in enumerate(fields):
5867                 add_item(f, reverse, closest,
5868                          limits[i] if i < limit_count
5869                          else limits[0] if has_limit and not has_multiple_limits
5870                          else None)
5871
5872     def print_verbose_info(self, write_debug):
5873         if self._sort_user:
5874             write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5875         if self._sort_extractor:
5876             write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5877         write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5878             '+' if self._get_field_setting(field, 'reverse') else '', field,
5879             '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5880                           self._get_field_setting(field, 'limit_text'),
5881                           self._get_field_setting(field, 'limit'))
5882             if self._get_field_setting(field, 'limit_text') is not None else '')
5883             for field in self._order if self._get_field_setting(field, 'visible')]))
5884
5885     def _calculate_field_preference_from_value(self, format, field, type, value):
5886         reverse = self._get_field_setting(field, 'reverse')
5887         closest = self._get_field_setting(field, 'closest')
5888         limit = self._get_field_setting(field, 'limit')
5889
5890         if type == 'extractor':
5891             maximum = self._get_field_setting(field, 'max')
5892             if value is None or (maximum is not None and value >= maximum):
5893                 value = -1
5894         elif type == 'boolean':
5895             in_list = self._get_field_setting(field, 'in_list')
5896             not_in_list = self._get_field_setting(field, 'not_in_list')
5897             value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5898         elif type == 'ordered':
5899             value = self._resolve_field_value(field, value, True)
5900
5901         # try to convert to number
5902         val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5903         is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5904         if is_num:
5905             value = val_num
5906
5907         return ((-10, 0) if value is None
5908                 else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
5909                 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5910                 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5911                 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5912                 else (-1, value, 0))
5913
5914     def _calculate_field_preference(self, format, field):
5915         type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
5916         get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5917         if type == 'multiple':
5918             type = 'field'  # Only 'field' is allowed in multiple for now
5919             actual_fields = self._get_field_setting(field, 'field')
5920
5921             value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5922         else:
5923             value = get_value(field)
5924         return self._calculate_field_preference_from_value(format, field, type, value)
5925
5926     def calculate_preference(self, format):
5927         # Determine missing protocol
5928         if not format.get('protocol'):
5929             format['protocol'] = determine_protocol(format)
5930
5931         # Determine missing ext
5932         if not format.get('ext') and 'url' in format:
5933             format['ext'] = determine_ext(format['url'])
5934         if format.get('vcodec') == 'none':
5935             format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5936             format['video_ext'] = 'none'
5937         else:
5938             format['video_ext'] = format['ext']
5939             format['audio_ext'] = 'none'
5940         # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
5941         #    format['preference'] = -1000
5942
5943         if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5944             # HEVC-over-FLV is out-of-spec by FLV's original spec
5945             # ref. https://trac.ffmpeg.org/ticket/6389
5946             # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5947             format['preference'] = -100
5948
5949         # Determine missing bitrates
5950         if format.get('tbr') is None:
5951             if format.get('vbr') is not None and format.get('abr') is not None:
5952                 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
5953         else:
5954             if format.get('vcodec') != 'none' and format.get('vbr') is None:
5955                 format['vbr'] = format.get('tbr') - format.get('abr', 0)
5956             if format.get('acodec') != 'none' and format.get('abr') is None:
5957                 format['abr'] = format.get('tbr') - format.get('vbr', 0)
5958
5959         return tuple(self._calculate_field_preference(format, field) for field in self._order)