import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from . import traversal

from ..compat import functools  # isort: split
from ..compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from ..dependencies import brotli, certifi, websockets, xattr
from ..socks import ProxyType, sockssocket

__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
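# Usage sketch (illustrative only; the filename is hypothetical):
#   write_json_file({'id': 'abc123'}, 'info.json')
# serializes to a same-directory temp file first, then renames it into place.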


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)
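# Usage sketch (illustrative only; `doc` stands for an ElementTree element):
#   find_xpath_attr(doc, './/source', 'src', 'a.mp4')
# is equivalent to doc.find(".//source[@src='a.mp4']")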

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
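# Usage sketch (illustrative only; the namespace URI is made up):
#   xpath_with_ns('media:song/media:url', {'media': 'http://example.com/ns'})
#   == '{http://example.com/ns}song/{http://example.com/ns}url'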


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kwargs):
386 """Return the content of all tags with the specified class in the passed HTML document as a list"""
387 return get_elements_by_attribute(
388 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
389 html, escape_value=False)
390
391
392 def get_elements_html_by_class(class_name, html):
393 """Return the html of all tags with the specified class in the passed HTML document as a list"""
394 return get_elements_html_by_attribute(
395 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
396 html, escape_value=False)
397
398
399 def get_elements_by_attribute(*args, **kwargs):
400 """Return the content of the tag with the specified attribute in the passed HTML document"""
401 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
402
403
404 def get_elements_html_by_attribute(*args, **kwargs):
405 """Return the html of the tag with the specified attribute in the passed HTML document"""
406 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
407
408
409 def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
410 """
411 Return the text (content) and the html (whole) of the tag with the specified
412 attribute in the passed HTML document
413 """
414 if not value:
415 return
416
417 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
418
419 value = re.escape(value) if escape_value else value
420
421 partial_element_re = rf'''(?x)
422 <(?P<tag>{tag})
423 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
424 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
425 '''
426
427 for m in re.finditer(partial_element_re, html):
428 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
429
430 yield (
431 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
432 whole
433 )
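# Usage sketch (illustrative only): the generator yields (text, html) pairs, e.g.
#   list(get_elements_text_and_html_by_attribute(
#       'class', 'foo', '<span class="foo">bar</span>'))
#   == [('bar', '<span class="foo">bar</span>')]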


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
483 """
484 def find_or_raise(haystack, needle, exc):
485 try:
486 return haystack.index(needle)
487 except ValueError:
488 raise exc
489 closing_tag = f'</{tag}>'
490 whole_start = find_or_raise(
491 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
492 content_start = find_or_raise(
493 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
494 content_start += whole_start + 1
495 with HTMLBreakOnClosingTagParser() as parser:
496 parser.feed(html[whole_start:content_start])
497 if not parser.tagstack or parser.tagstack[0] != tag:
498 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
499 offset = content_start
500 while offset < len(html):
501 next_closing_tag_start = find_or_raise(
502 html[offset:], closing_tag,
503 compat_HTMLParseError(f'closing {tag} tag not found'))
504 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
505 try:
506 parser.feed(html[offset:offset + next_closing_tag_end])
507 offset += next_closing_tag_end
508 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
509 return html[content_start:offset + next_closing_tag_start], \
510 html[whole_start:offset + next_closing_tag_end]
511 raise compat_HTMLParseError('unexpected end of html')
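# Usage sketch (illustrative only):
#   get_element_text_and_html_by_tag('span', '<div><span>bar</span></div>')
#   == ('bar', '<span>bar</span>')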


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
565 """Given a string for an series of HTML <li> elements,
566 return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
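# Usage sketch (illustrative only):
#   clean_html('<p>foo<br/>bar &amp; baz</p>')  == 'foo\nbar & baz'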


class LenientJSONDecoder(json.JSONDecoder):
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
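# Usage sketch (illustrative only): with close_objects=1, a truncated object can
# be recovered by appending the missing bracket(s), e.g.
#   LenientJSONDecoder(close_objects=1).decode('{"a": 1')  == {'a': 1}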


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
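# Usage sketch (illustrative only):
#   timeconvert('Sat, 01 Jan 2022 00:00:00 +0000')  == 1640995200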


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
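# Usage sketches (illustrative only; exact substitutions depend on the flags):
#   sanitize_filename('A/B: C', restricted=True)  == 'A_B_-_C'
#   sanitize_filename('foo|bar?')  replaces '|' and '?' with their full-width
#   look-alikes under the default (is_id unset) rules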


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
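# Usage sketches (illustrative only):
#   sanitize_url('//example.com/video')      == 'http://example.com/video'
#   sanitize_url('httpss://example.com')     == 'https://example.com'
#   sanitize_url('rmtp://example.com/live')  == 'rtmp://example.com/live'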


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
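# Usage sketch (illustrative only; the credentials are made up):
#   extract_basic_auth('http://user:pass@example.com/')
#   == ('http://example.com/', 'Basic dXNlcjpwYXNz')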


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
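# Usage sketch (illustrative only):
#   orderedSet([1, 2, 1, 3])        == [1, 2, 3]
#   orderedSet('cabcb', lazy=True)  yields 'c', 'a', 'b' on demand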


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
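# Usage sketch (illustrative only):
#   unescapeHTML('&amp; &#x27;quoted&#x27;')  == "& 'quoted'"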


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
             https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
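# Usage sketch (illustrative only; the command is assumed to be installed):
#   stdout, stderr, returncode = Popen.run(
#       ['ffmpeg', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)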


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)
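# Usage sketch (illustrative only):
#   timetuple_from_msec(90061001)
#   == Time(hours=25, minutes=1, seconds=1, milliseconds=1)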


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
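# Usage sketches (illustrative only):
#   formatSeconds(3661)             == '1:01:01'
#   formatSeconds(61.5, msec=True)  == '1:01.500'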


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
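# Usage sketch (illustrative only; `params` mirrors the option keys read above,
# e.g. 'nocheckcertificate', 'legacyserverconnect', 'client_certificate'):
#   opener = urllib.request.build_opener(make_HTTPS_handler({}))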


def bug_reports_message(before=';'):
    from ..update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to work around _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # urllib capitalizes the header names stored in req.headers because
            # of Python bug 2275: http://bugs.python.org/issue2275
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Youtubedl-no-compression' in req.headers:  # deprecated
            req.headers.pop('Youtubedl-no-compression', None)
            req.add_header('Accept-encoding', 'identity')

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
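# Usage sketch (illustrative only; the proxy address is made up):
#   conn_class = make_socks_conn_class(
#       http.client.HTTPConnection, 'socks5://127.0.0.1:1080')
#   conn = conn_class('example.com', 80)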


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if is_path_like(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if is_path_like(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
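# Usage sketch (illustrative only; the filename is hypothetical):
#   jar = YoutubeDLCookieJar('cookies.txt')
#   jar.load(ignore_discard=True, ignore_expires=True)
#   jar.save()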


class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
1700 # Strip headers that describe the previous request's body, since they may not apply to the redirected request
1701 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1702
1703 # A 303 must either use GET or HEAD for subsequent request
1704 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1705 if code == 303 and m != 'HEAD':
1706 m = 'GET'
1707 # 301 and 302 redirects are commonly turned into a GET from a POST
1708 # for subsequent requests by browsers, so we'll do the same.
1709 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1710 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1711 if code in (301, 302) and m == 'POST':
1712 m = 'GET'
1713
1714 return urllib.request.Request(
1715 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1716 unverifiable=True, method=m)
1717
1718
1719 def extract_timezone(date_str):
1720 m = re.search(
1721 r'''(?x)
1722 ^.{8,}? # >=8 char non-TZ prefix, if present
1723 (?P<tz>Z| # just the UTC Z, or
1724 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm, or
1725 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by a 3-letter word, >=4 letters or 2 digits
1726 [ ]? # optional space
1727 (?P<sign>\+|-) # +/-
1728 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1729 $)
1730 ''', date_str)
1731 if not m:
1732 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1733 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1734 if timezone is not None:
1735 date_str = date_str[:-len(m.group('tz'))]
1736 timezone = datetime.timedelta(hours=timezone or 0)
1737 else:
1738 date_str = date_str[:-len(m.group('tz'))]
1739 if not m.group('sign'):
1740 timezone = datetime.timedelta()
1741 else:
1742 sign = 1 if m.group('sign') == '+' else -1
1743 timezone = datetime.timedelta(
1744 hours=sign * int(m.group('hours')),
1745 minutes=sign * int(m.group('minutes')))
1746 return timezone, date_str
1747
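# Usage sketch (illustrative addition, not part of the original module; values
# assume the regex above matches as documented):
#   extract_timezone('2023-01-01T12:00:00+05:30')
#   # -> (datetime.timedelta(seconds=19800), '2023-01-01T12:00:00')
#   extract_timezone('2023-01-01T12:00:00Z')
#   # -> (datetime.timedelta(0), '2023-01-01T12:00:00')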
1748
1749 def parse_iso8601(date_str, delimiter='T', timezone=None):
1750 """ Return a UNIX timestamp from the given date """
1751
1752 if date_str is None:
1753 return None
1754
1755 date_str = re.sub(r'\.[0-9]+', '', date_str)
1756
1757 if timezone is None:
1758 timezone, date_str = extract_timezone(date_str)
1759
1760 with contextlib.suppress(ValueError):
1761 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1762 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1763 return calendar.timegm(dt.timetuple())
1764
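# Usage sketch (illustrative; epoch values computed by hand, assuming UTC):
#   parse_iso8601('2023-01-01T12:00:00Z')       # -> 1672574400
#   parse_iso8601('2023-01-01T12:00:00+05:30')  # -> 1672554600 (offset subtracted)
#   parse_iso8601(None)                         # -> None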
1765
1766 def date_formats(day_first=True):
1767 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1768
1769
1770 def unified_strdate(date_str, day_first=True):
1771 """Return a string with the date in the format YYYYMMDD"""
1772
1773 if date_str is None:
1774 return None
1775 upload_date = None
1776 # Replace commas
1777 date_str = date_str.replace(',', ' ')
1778 # Remove AM/PM + timezone
1779 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1780 _, date_str = extract_timezone(date_str)
1781
1782 for expression in date_formats(day_first):
1783 with contextlib.suppress(ValueError):
1784 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1785 if upload_date is None:
1786 timetuple = email.utils.parsedate_tz(date_str)
1787 if timetuple:
1788 with contextlib.suppress(ValueError):
1789 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1790 if upload_date is not None:
1791 return str(upload_date)
1792
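# Usage sketch (illustrative; relies on DATE_FORMATS defined elsewhere in this module):
#   unified_strdate('December 21, 2010')  # -> '20101221'
#   unified_strdate('8/7/2009')           # -> '20090708' (day_first=True by default)
#   unified_strdate('not a date')         # -> None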
1793
1794 def unified_timestamp(date_str, day_first=True):
1795 if date_str is None:
1796 return None
1797
1798 date_str = re.sub(r'\s+', ' ', re.sub(
1799 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1800
1801 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1802 timezone, date_str = extract_timezone(date_str)
1803
1804 # Remove AM/PM + timezone
1805 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1806
1807 # Remove unrecognized timezones from ISO 8601-like timestamps
1808 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1809 if m:
1810 date_str = date_str[:-len(m.group('tz'))]
1811
1812 # Python only supports microseconds, so remove nanoseconds
1813 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1814 if m:
1815 date_str = m.group(1)
1816
1817 for expression in date_formats(day_first):
1818 with contextlib.suppress(ValueError):
1819 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1820 return calendar.timegm(dt.timetuple())
1821
1822 timetuple = email.utils.parsedate_tz(date_str)
1823 if timetuple:
1824 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1825
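# Usage sketch (illustrative; assumes the corresponding format is in DATE_FORMATS):
#   unified_timestamp('2023-01-01T12:00:00Z')          # -> 1672574400
#   unified_timestamp('December 15, 2017 at 7:49 am')  # -> 1513324140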
1826
1827 def determine_ext(url, default_ext='unknown_video'):
1828 if url is None or '.' not in url:
1829 return default_ext
1830 guess = url.partition('?')[0].rpartition('.')[2]
1831 if re.match(r'^[A-Za-z0-9]+$', guess):
1832 return guess
1833 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1834 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1835 return guess.rstrip('/')
1836 else:
1837 return default_ext
1838
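# Usage sketch (illustrative; assumes 'mp4' is listed in KNOWN_EXTENSIONS):
#   determine_ext('http://example.com/video.mp4?download=1')   # -> 'mp4'
#   determine_ext('http://example.com/foo/bar.mp4/?download')  # -> 'mp4'
#   determine_ext('http://example.com/page')                   # -> 'unknown_video'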
1839
1840 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1841 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1842
1843
1844 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1845 R"""
1846 Return a datetime object from a string.
1847 Supported format:
1848 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1849
1850 @param format strftime format of DATE
1851 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1852 auto: round to the unit provided in date_str (if applicable).
1853 """
1854 auto_precision = False
1855 if precision == 'auto':
1856 auto_precision = True
1857 precision = 'microsecond'
1858 today = datetime_round(datetime.datetime.utcnow(), precision)
1859 if date_str in ('now', 'today'):
1860 return today
1861 if date_str == 'yesterday':
1862 return today - datetime.timedelta(days=1)
1863 match = re.match(
1864 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1865 date_str)
1866 if match is not None:
1867 start_time = datetime_from_str(match.group('start'), precision, format)
1868 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1869 unit = match.group('unit')
1870 if unit == 'month' or unit == 'year':
1871 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1872 unit = 'day'
1873 else:
1874 if unit == 'week':
1875 unit = 'day'
1876 time *= 7
1877 delta = datetime.timedelta(**{unit + 's': time})
1878 new_date = start_time + delta
1879 if auto_precision:
1880 return datetime_round(new_date, unit)
1881 return new_date
1882
1883 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1884
1885
1886 def date_from_str(date_str, format='%Y%m%d', strict=False):
1887 R"""
1888 Return a date object from a string using datetime_from_str
1889
1890 @param strict Restrict allowed patterns to "YYYYMMDD" and
1891 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1892 """
1893 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1894 raise ValueError(f'Invalid date format "{date_str}"')
1895 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1896
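# Usage sketch (illustrative; results depend on the current date):
#   date_from_str('now')          # -> today's date
#   date_from_str('today-1week')  # -> the date 7 days ago
#   date_from_str('20230101')     # -> datetime.date(2023, 1, 1)
#   date_from_str('now+5months', strict=True)  # raises ValueError ('+' offsets are not allowed in strict mode)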
1897
1898 def datetime_add_months(dt, months):
1899 """Increment/Decrement a datetime object by months."""
1900 month = dt.month + months - 1
1901 year = dt.year + month // 12
1902 month = month % 12 + 1
1903 day = min(dt.day, calendar.monthrange(year, month)[1])
1904 return dt.replace(year, month, day)
1905
1906
1907 def datetime_round(dt, precision='day'):
1908 """
1909 Round a datetime object's time to a specific precision
1910 """
1911 if precision == 'microsecond':
1912 return dt
1913
1914 unit_seconds = {
1915 'day': 86400,
1916 'hour': 3600,
1917 'minute': 60,
1918 'second': 1,
1919 }
1920 roundto = lambda x, n: ((x + n / 2) // n) * n
1921 timestamp = calendar.timegm(dt.timetuple())
1922 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1923
1924
1925 def hyphenate_date(date_str):
1926 """
1927 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1928 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1929 if match is not None:
1930 return '-'.join(match.groups())
1931 else:
1932 return date_str
1933
1934
1935 class DateRange:
1936 """Represents a time interval between two dates"""
1937
1938 def __init__(self, start=None, end=None):
1939 """start and end must be strings in the format accepted by date"""
1940 if start is not None:
1941 self.start = date_from_str(start, strict=True)
1942 else:
1943 self.start = datetime.datetime.min.date()
1944 if end is not None:
1945 self.end = date_from_str(end, strict=True)
1946 else:
1947 self.end = datetime.datetime.max.date()
1948 if self.start > self.end:
1949 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
1950
1951 @classmethod
1952 def day(cls, day):
1953 """Returns a range that only contains the given day"""
1954 return cls(day, day)
1955
1956 def __contains__(self, date):
1957 """Check if the date is in the range"""
1958 if not isinstance(date, datetime.date):
1959 date = date_from_str(date)
1960 return self.start <= date <= self.end
1961
1962 def __repr__(self):
1963 return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
1964
1965 def __eq__(self, other):
1966 return (isinstance(other, DateRange)
1967 and self.start == other.start and self.end == other.end)
1968
1969
1970 @functools.cache
1971 def system_identifier():
1972 python_implementation = platform.python_implementation()
1973 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1974 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1975 libc_ver = []
1976 with contextlib.suppress(OSError): # We may not have access to the executable
1977 libc_ver = platform.libc_ver()
1978
1979 return 'Python %s (%s %s %s) - %s (%s%s)' % (
1980 platform.python_version(),
1981 python_implementation,
1982 platform.machine(),
1983 platform.architecture()[0],
1984 platform.platform(),
1985 ssl.OPENSSL_VERSION,
1986 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
1987 )
1988
1989
1990 @functools.cache
1991 def get_windows_version():
1992 ''' Get the Windows version. Returns () if not running on Windows '''
1993 if compat_os_name == 'nt':
1994 return version_tuple(platform.win32_ver()[1])
1995 else:
1996 return ()
1997
1998
1999 def write_string(s, out=None, encoding=None):
2000 assert isinstance(s, str)
2001 out = out or sys.stderr
2002 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
2003 if not out:
2004 return
2005
2006 if compat_os_name == 'nt' and supports_terminal_sequences(out):
2007 s = re.sub(r'([\r\n]+)', r' \1', s)
2008
2009 enc, buffer = None, out
2010 if 'b' in getattr(out, 'mode', ''):
2011 enc = encoding or preferredencoding()
2012 elif hasattr(out, 'buffer'):
2013 buffer = out.buffer
2014 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2015
2016 buffer.write(s.encode(enc, 'ignore') if enc else s)
2017 out.flush()
2018
2019
2020 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2021 from .. import _IN_CLI
2022 if _IN_CLI:
2023 if msg in deprecation_warning._cache:
2024 return
2025 deprecation_warning._cache.add(msg)
2026 if printer:
2027 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2028 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2029 else:
2030 import warnings
2031 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2032
2033
2034 deprecation_warning._cache = set()
2035
2036
2037 def bytes_to_intlist(bs):
2038 if not bs:
2039 return []
2040 if isinstance(bs[0], int): # Python 3
2041 return list(bs)
2042 else:
2043 return [ord(c) for c in bs]
2044
2045
2046 def intlist_to_bytes(xs):
2047 if not xs:
2048 return b''
2049 return struct.pack('%dB' % len(xs), *xs)
2050
2051
2052 class LockingUnsupportedError(OSError):
2053 msg = 'File locking is not supported'
2054
2055 def __init__(self):
2056 super().__init__(self.msg)
2057
2058
2059 # Cross-platform file locking
2060 if sys.platform == 'win32':
2061 import ctypes
2062 import ctypes.wintypes
2063 import msvcrt
2064
2065 class OVERLAPPED(ctypes.Structure):
2066 _fields_ = [
2067 ('Internal', ctypes.wintypes.LPVOID),
2068 ('InternalHigh', ctypes.wintypes.LPVOID),
2069 ('Offset', ctypes.wintypes.DWORD),
2070 ('OffsetHigh', ctypes.wintypes.DWORD),
2071 ('hEvent', ctypes.wintypes.HANDLE),
2072 ]
2073
2074 kernel32 = ctypes.WinDLL('kernel32')
2075 LockFileEx = kernel32.LockFileEx
2076 LockFileEx.argtypes = [
2077 ctypes.wintypes.HANDLE, # hFile
2078 ctypes.wintypes.DWORD, # dwFlags
2079 ctypes.wintypes.DWORD, # dwReserved
2080 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2081 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2082 ctypes.POINTER(OVERLAPPED) # Overlapped
2083 ]
2084 LockFileEx.restype = ctypes.wintypes.BOOL
2085 UnlockFileEx = kernel32.UnlockFileEx
2086 UnlockFileEx.argtypes = [
2087 ctypes.wintypes.HANDLE, # hFile
2088 ctypes.wintypes.DWORD, # dwReserved
2089 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2090 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2091 ctypes.POINTER(OVERLAPPED) # Overlapped
2092 ]
2093 UnlockFileEx.restype = ctypes.wintypes.BOOL
2094 whole_low = 0xffffffff
2095 whole_high = 0x7fffffff
2096
2097 def _lock_file(f, exclusive, block):
2098 overlapped = OVERLAPPED()
2099 overlapped.Offset = 0
2100 overlapped.OffsetHigh = 0
2101 overlapped.hEvent = 0
2102 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2103
2104 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2105 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2106 0, whole_low, whole_high, f._lock_file_overlapped_p):
2107 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2108 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2109
2110 def _unlock_file(f):
2111 assert f._lock_file_overlapped_p
2112 handle = msvcrt.get_osfhandle(f.fileno())
2113 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2114 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2115
2116 else:
2117 try:
2118 import fcntl
2119
2120 def _lock_file(f, exclusive, block):
2121 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2122 if not block:
2123 flags |= fcntl.LOCK_NB
2124 try:
2125 fcntl.flock(f, flags)
2126 except BlockingIOError:
2127 raise
2128 except OSError: # AOSP does not have flock()
2129 fcntl.lockf(f, flags)
2130
2131 def _unlock_file(f):
2132 with contextlib.suppress(OSError):
2133 return fcntl.flock(f, fcntl.LOCK_UN)
2134 with contextlib.suppress(OSError):
2135 return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
2136 return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
2137
2138 except ImportError:
2139
2140 def _lock_file(f, exclusive, block):
2141 raise LockingUnsupportedError()
2142
2143 def _unlock_file(f):
2144 raise LockingUnsupportedError()
2145
2146
2147 class locked_file:
2148 locked = False
2149
2150 def __init__(self, filename, mode, block=True, encoding=None):
2151 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2152 raise NotImplementedError(mode)
2153 self.mode, self.block = mode, block
2154
2155 writable = any(f in mode for f in 'wax+')
2156 readable = any(f in mode for f in 'r+')
2157 flags = functools.reduce(operator.ior, (
2158 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2159 getattr(os, 'O_BINARY', 0), # Windows only
2160 getattr(os, 'O_NOINHERIT', 0), # Windows only
2161 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2162 os.O_APPEND if 'a' in mode else 0,
2163 os.O_EXCL if 'x' in mode else 0,
2164 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2165 ))
2166
2167 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2168
2169 def __enter__(self):
2170 exclusive = 'r' not in self.mode
2171 try:
2172 _lock_file(self.f, exclusive, self.block)
2173 self.locked = True
2174 except OSError:
2175 self.f.close()
2176 raise
2177 if 'w' in self.mode:
2178 try:
2179 self.f.truncate()
2180 except OSError as e:
2181 if e.errno not in (
2182 errno.ESPIPE, # Illegal seek - expected for FIFO
2183 errno.EINVAL, # Invalid argument - expected for /dev/null
2184 ):
2185 raise
2186 return self
2187
2188 def unlock(self):
2189 if not self.locked:
2190 return
2191 try:
2192 _unlock_file(self.f)
2193 finally:
2194 self.locked = False
2195
2196 def __exit__(self, *_):
2197 try:
2198 self.unlock()
2199 finally:
2200 self.f.close()
2201
2202 open = __enter__
2203 close = __exit__
2204
2205 def __getattr__(self, attr):
2206 return getattr(self.f, attr)
2207
2208 def __iter__(self):
2209 return iter(self.f)
2210
2211
2212 @functools.cache
2213 def get_filesystem_encoding():
2214 encoding = sys.getfilesystemencoding()
2215 return encoding if encoding is not None else 'utf-8'
2216
2217
2218 def shell_quote(args):
2219 quoted_args = []
2220 encoding = get_filesystem_encoding()
2221 for a in args:
2222 if isinstance(a, bytes):
2223 # We may get a filename encoded with 'encodeFilename'
2224 a = a.decode(encoding)
2225 quoted_args.append(compat_shlex_quote(a))
2226 return ' '.join(quoted_args)
2227
2228
2229 def smuggle_url(url, data):
2230 """ Pass additional data in a URL for internal use. """
2231
2232 url, idata = unsmuggle_url(url, {})
2233 data.update(idata)
2234 sdata = urllib.parse.urlencode(
2235 {'__youtubedl_smuggle': json.dumps(data)})
2236 return url + '#' + sdata
2237
2238
2239 def unsmuggle_url(smug_url, default=None):
2240 if '#__youtubedl_smuggle' not in smug_url:
2241 return smug_url, default
2242 url, _, sdata = smug_url.rpartition('#')
2243 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2244 data = json.loads(jsond)
2245 return url, data
2246
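# Usage sketch (illustrative round trip):
#   url = smuggle_url('https://example.com/video', {'referer': 'https://example.com'})
#   unsmuggle_url(url)
#   # -> ('https://example.com/video', {'referer': 'https://example.com'})
#   unsmuggle_url('https://example.com/video')  # -> ('https://example.com/video', None)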
2247
2248 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2249 """ Formats numbers with decimal sufixes like K, M, etc """
2250 num, factor = float_or_none(num), float(factor)
2251 if num is None or num < 0:
2252 return None
2253 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2254 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2255 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2256 if factor == 1024:
2257 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2258 converted = num / (factor ** exponent)
2259 return fmt % (converted, suffix)
2260
2261
2262 def format_bytes(bytes):
2263 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2264
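# Usage sketch (illustrative):
#   format_decimal_suffix(1234, '%.1f%s')  # -> '1.2k'
#   format_bytes(1536)                     # -> '1.50KiB'
#   format_bytes(None)                     # -> 'N/A'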
2265
2266 def lookup_unit_table(unit_table, s, strict=False):
2267 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
2268 units_re = '|'.join(re.escape(u) for u in unit_table)
2269 m = (re.fullmatch if strict else re.match)(
2270 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
2271 if not m:
2272 return None
2273
2274 num = float(m.group('num').replace(',', '.'))
2275 mult = unit_table[m.group('unit')]
2276 return round(num * mult)
2277
2278
2279 def parse_bytes(s):
2280 """Parse a string indicating a byte quantity into an integer"""
2281 return lookup_unit_table(
2282 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2283 s.upper(), strict=True)
2284
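# Usage sketch (illustrative; parse_bytes always uses binary (1024-based) units):
#   parse_bytes('10M')   # -> 10485760 (10 * 1024**2)
#   parse_bytes('500k')  # -> 512000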
2285
2286 def parse_filesize(s):
2287 if s is None:
2288 return None
2289
2290 # The lower-case forms are of course incorrect and unofficial,
2291 # but we support those too
2292 _UNIT_TABLE = {
2293 'B': 1,
2294 'b': 1,
2295 'bytes': 1,
2296 'KiB': 1024,
2297 'KB': 1000,
2298 'kB': 1024,
2299 'Kb': 1000,
2300 'kb': 1000,
2301 'kilobytes': 1000,
2302 'kibibytes': 1024,
2303 'MiB': 1024 ** 2,
2304 'MB': 1000 ** 2,
2305 'mB': 1024 ** 2,
2306 'Mb': 1000 ** 2,
2307 'mb': 1000 ** 2,
2308 'megabytes': 1000 ** 2,
2309 'mebibytes': 1024 ** 2,
2310 'GiB': 1024 ** 3,
2311 'GB': 1000 ** 3,
2312 'gB': 1024 ** 3,
2313 'Gb': 1000 ** 3,
2314 'gb': 1000 ** 3,
2315 'gigabytes': 1000 ** 3,
2316 'gibibytes': 1024 ** 3,
2317 'TiB': 1024 ** 4,
2318 'TB': 1000 ** 4,
2319 'tB': 1024 ** 4,
2320 'Tb': 1000 ** 4,
2321 'tb': 1000 ** 4,
2322 'terabytes': 1000 ** 4,
2323 'tebibytes': 1024 ** 4,
2324 'PiB': 1024 ** 5,
2325 'PB': 1000 ** 5,
2326 'pB': 1024 ** 5,
2327 'Pb': 1000 ** 5,
2328 'pb': 1000 ** 5,
2329 'petabytes': 1000 ** 5,
2330 'pebibytes': 1024 ** 5,
2331 'EiB': 1024 ** 6,
2332 'EB': 1000 ** 6,
2333 'eB': 1024 ** 6,
2334 'Eb': 1000 ** 6,
2335 'eb': 1000 ** 6,
2336 'exabytes': 1000 ** 6,
2337 'exbibytes': 1024 ** 6,
2338 'ZiB': 1024 ** 7,
2339 'ZB': 1000 ** 7,
2340 'zB': 1024 ** 7,
2341 'Zb': 1000 ** 7,
2342 'zb': 1000 ** 7,
2343 'zettabytes': 1000 ** 7,
2344 'zebibytes': 1024 ** 7,
2345 'YiB': 1024 ** 8,
2346 'YB': 1000 ** 8,
2347 'yB': 1024 ** 8,
2348 'Yb': 1000 ** 8,
2349 'yb': 1000 ** 8,
2350 'yottabytes': 1000 ** 8,
2351 'yobibytes': 1024 ** 8,
2352 }
2353
2354 return lookup_unit_table(_UNIT_TABLE, s)
2355
2356
2357 def parse_count(s):
2358 if s is None:
2359 return None
2360
2361 s = re.sub(r'^[^\d]+\s', '', s).strip()
2362
2363 if re.match(r'^[\d,.]+$', s):
2364 return str_to_int(s)
2365
2366 _UNIT_TABLE = {
2367 'k': 1000,
2368 'K': 1000,
2369 'm': 1000 ** 2,
2370 'M': 1000 ** 2,
2371 'kk': 1000 ** 2,
2372 'KK': 1000 ** 2,
2373 'b': 1000 ** 3,
2374 'B': 1000 ** 3,
2375 }
2376
2377 ret = lookup_unit_table(_UNIT_TABLE, s)
2378 if ret is not None:
2379 return ret
2380
2381 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2382 if mobj:
2383 return str_to_int(mobj.group(1))
2384
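# Usage sketch (illustrative):
#   parse_filesize('1.5 MiB')   # -> 1572864
#   parse_filesize('5 GB')      # -> 5000000000
#   parse_count('1.2M')         # -> 1200000
#   parse_count('1,000 views')  # -> 1000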
2385
2386 def parse_resolution(s, *, lenient=False):
2387 if s is None:
2388 return {}
2389
2390 if lenient:
2391 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2392 else:
2393 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2394 if mobj:
2395 return {
2396 'width': int(mobj.group('w')),
2397 'height': int(mobj.group('h')),
2398 }
2399
2400 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2401 if mobj:
2402 return {'height': int(mobj.group(1))}
2403
2404 mobj = re.search(r'\b([48])[kK]\b', s)
2405 if mobj:
2406 return {'height': int(mobj.group(1)) * 540}
2407
2408 return {}
2409
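# Usage sketch (illustrative):
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # -> {'height': 720}
#   parse_resolution('4k')         # -> {'height': 2160}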
2410
2411 def parse_bitrate(s):
2412 if not isinstance(s, str):
2413 return
2414 mobj = re.search(r'\b(\d+)\s*kbps', s)
2415 if mobj:
2416 return int(mobj.group(1))
2417
2418
2419 def month_by_name(name, lang='en'):
2420 """ Return the number of a month by (locale-independently) English name """
2421
2422 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2423
2424 try:
2425 return month_names.index(name) + 1
2426 except ValueError:
2427 return None
2428
2429
2430 def month_by_abbreviation(abbrev):
2431 """ Return the number of a month by (locale-independently) English
2432 abbreviations """
2433
2434 try:
2435 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2436 except ValueError:
2437 return None
2438
2439
2440 def fix_xml_ampersands(xml_str):
2441 """Replace all the '&' by '&amp;' in XML"""
2442 return re.sub(
2443 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2444 '&amp;',
2445 xml_str)
2446
2447
2448 def setproctitle(title):
2449 assert isinstance(title, str)
2450
2451 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2452 try:
2453 import ctypes
2454 except ImportError:
2455 return
2456
2457 try:
2458 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2459 except OSError:
2460 return
2461 except TypeError:
2462 # LoadLibrary in Windows Python 2.7.13 only expects
2463 # a bytestring, but since unicode_literals turns
2464 # every string into a unicode string, it fails.
2465 return
2466 title_bytes = title.encode()
2467 buf = ctypes.create_string_buffer(len(title_bytes))
2468 buf.value = title_bytes
2469 try:
2470 libc.prctl(15, buf, 0, 0, 0)
2471 except AttributeError:
2472 return # Strange libc, just skip this
2473
2474
2475 def remove_start(s, start):
2476 return s[len(start):] if s is not None and s.startswith(start) else s
2477
2478
2479 def remove_end(s, end):
2480 return s[:-len(end)] if s is not None and s.endswith(end) else s
2481
2482
2483 def remove_quotes(s):
2484 if s is None or len(s) < 2:
2485 return s
2486 for quote in ('"', "'", ):
2487 if s[0] == quote and s[-1] == quote:
2488 return s[1:-1]
2489 return s
2490
2491
2492 def get_domain(url):
2493 """
2494 This implementation is inconsistent, but is kept for compatibility.
2495 Use this only for "webpage_url_domain"
2496 """
2497 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2498
2499
2500 def url_basename(url):
2501 path = urllib.parse.urlparse(url).path
2502 return path.strip('/').split('/')[-1]
2503
2504
2505 def base_url(url):
2506 return re.match(r'https?://[^?#]+/', url).group()
2507
2508
2509 def urljoin(base, path):
2510 if isinstance(path, bytes):
2511 path = path.decode()
2512 if not isinstance(path, str) or not path:
2513 return None
2514 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2515 return path
2516 if isinstance(base, bytes):
2517 base = base.decode()
2518 if not isinstance(base, str) or not re.match(
2519 r'^(?:https?:)?//', base):
2520 return None
2521 return urllib.parse.urljoin(base, path)
2522
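# Usage sketch (illustrative):
#   urljoin('https://example.com/a/', 'b/c.mp4')  # -> 'https://example.com/a/b/c.mp4'
#   urljoin('https://example.com', '//cdn.example.com/x')  # -> '//cdn.example.com/x' (already absolute)
#   urljoin('not a base', '/path')  # -> None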
2523
2524 class HEADRequest(urllib.request.Request):
2525 def get_method(self):
2526 return 'HEAD'
2527
2528
2529 class PUTRequest(urllib.request.Request):
2530 def get_method(self):
2531 return 'PUT'
2532
2533
2534 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2535 if get_attr and v is not None:
2536 v = getattr(v, get_attr, None)
2537 try:
2538 return int(v) * invscale // scale
2539 except (ValueError, TypeError, OverflowError):
2540 return default
2541
2542
2543 def str_or_none(v, default=None):
2544 return default if v is None else str(v)
2545
2546
2547 def str_to_int(int_str):
2548 """ A more relaxed version of int_or_none """
2549 if isinstance(int_str, int):
2550 return int_str
2551 elif isinstance(int_str, str):
2552 int_str = re.sub(r'[,\.\+]', '', int_str)
2553 return int_or_none(int_str)
2554
2555
2556 def float_or_none(v, scale=1, invscale=1, default=None):
2557 if v is None:
2558 return default
2559 try:
2560 return float(v) * invscale / scale
2561 except (ValueError, TypeError):
2562 return default
2563
2564
2565 def bool_or_none(v, default=None):
2566 return v if isinstance(v, bool) else default
2567
2568
2569 def strip_or_none(v, default=None):
2570 return v.strip() if isinstance(v, str) else default
2571
2572
2573 def url_or_none(url):
2574 if not url or not isinstance(url, str):
2575 return None
2576 url = url.strip()
2577 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2578
2579
2580 def request_to_url(req):
2581 if isinstance(req, urllib.request.Request):
2582 return req.get_full_url()
2583 else:
2584 return req
2585
2586
2587 def strftime_or_none(timestamp, date_format, default=None):
2588 datetime_object = None
2589 try:
2590 if isinstance(timestamp, (int, float)): # unix timestamp
2591 # Using naive datetime here can break timestamp() in Windows
2592 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2593 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
2594 elif isinstance(timestamp, str): # assume YYYYMMDD
2595 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2596 date_format = re.sub( # Support %s on windows
2597 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
2598 return datetime_object.strftime(date_format)
2599 except (ValueError, TypeError, AttributeError):
2600 return default
2601
2602
2603 def parse_duration(s):
2604 if not isinstance(s, str):
2605 return None
2606 s = s.strip()
2607 if not s:
2608 return None
2609
2610 days, hours, mins, secs, ms = [None] * 5
2611 m = re.match(r'''(?x)
2612 (?P<before_secs>
2613 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2614 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2615 (?P<ms>[.:][0-9]+)?Z?$
2616 ''', s)
2617 if m:
2618 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2619 else:
2620 m = re.match(
2621 r'''(?ix)(?:P?
2622 (?:
2623 [0-9]+\s*y(?:ears?)?,?\s*
2624 )?
2625 (?:
2626 [0-9]+\s*m(?:onths?)?,?\s*
2627 )?
2628 (?:
2629 [0-9]+\s*w(?:eeks?)?,?\s*
2630 )?
2631 (?:
2632 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2633 )?
2634 T)?
2635 (?:
2636 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2637 )?
2638 (?:
2639 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2640 )?
2641 (?:
2642 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2643 )?Z?$''', s)
2644 if m:
2645 days, hours, mins, secs, ms = m.groups()
2646 else:
2647 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2648 if m:
2649 hours, mins = m.groups()
2650 else:
2651 return None
2652
2653 if ms:
2654 ms = ms.replace(':', '.')
2655 return sum(float(part or 0) * mult for part, mult in (
2656 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2657
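# Usage sketch (illustrative):
#   parse_duration('1:30')     # -> 90.0
#   parse_duration('2h 30m')   # -> 9000.0
#   parse_duration('PT1H30M')  # -> 5400.0 (ISO 8601-style)
#   parse_duration('invalid')  # -> None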
2658
2659 def prepend_extension(filename, ext, expected_real_ext=None):
2660 name, real_ext = os.path.splitext(filename)
2661 return (
2662 f'{name}.{ext}{real_ext}'
2663 if not expected_real_ext or real_ext[1:] == expected_real_ext
2664 else f'{filename}.{ext}')
2665
2666
2667 def replace_extension(filename, ext, expected_real_ext=None):
2668 name, real_ext = os.path.splitext(filename)
2669 return '{}.{}'.format(
2670 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2671 ext)
2672
2673
2674 def check_executable(exe, args=[]):
2675 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2676 args can be a list of arguments for a short output (like -version) """
2677 try:
2678 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2679 except OSError:
2680 return False
2681 return exe
2682
2683
2684 def _get_exe_version_output(exe, args):
2685 try:
2686 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2687 # SIGTTOU if yt-dlp is run in the background.
2688 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2689 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2690 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2691 if ret:
2692 return None
2693 except OSError:
2694 return False
2695 return stdout
2696
2697
2698 def detect_exe_version(output, version_re=None, unrecognized='present'):
2699 assert isinstance(output, str)
2700 if version_re is None:
2701 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2702 m = re.search(version_re, output)
2703 if m:
2704 return m.group(1)
2705 else:
2706 return unrecognized
2707
2708
2709 def get_exe_version(exe, args=['--version'],
2710 version_re=None, unrecognized=('present', 'broken')):
2711 """ Returns the version of the specified executable,
2712 or False if the executable is not present """
2713 unrecognized = variadic(unrecognized)
2714 assert len(unrecognized) in (1, 2)
2715 out = _get_exe_version_output(exe, args)
2716 if out is None:
2717 return unrecognized[-1]
2718 return out and detect_exe_version(out, version_re, unrecognized[0])
2719
2720
2721 def frange(start=0, stop=None, step=1):
2722 """Float range"""
2723 if stop is None:
2724 start, stop = 0, start
2725 sign = [-1, 1][step > 0] if step else 0
2726 while sign * start < sign * stop:
2727 yield start
2728 start += step
2729
2730
2731 class LazyList(collections.abc.Sequence):
2732 """Lazy immutable list from an iterable
2733 Note that slices of a LazyList are lists and not LazyList"""
2734
2735 class IndexError(IndexError):
2736 pass
2737
2738 def __init__(self, iterable, *, reverse=False, _cache=None):
2739 self._iterable = iter(iterable)
2740 self._cache = [] if _cache is None else _cache
2741 self._reversed = reverse
2742
2743 def __iter__(self):
2744 if self._reversed:
2745 # We need to consume the entire iterable to iterate in reverse
2746 yield from self.exhaust()
2747 return
2748 yield from self._cache
2749 for item in self._iterable:
2750 self._cache.append(item)
2751 yield item
2752
2753 def _exhaust(self):
2754 self._cache.extend(self._iterable)
2755 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2756 return self._cache
2757
2758 def exhaust(self):
2759 """Evaluate the entire iterable"""
2760 return self._exhaust()[::-1 if self._reversed else 1]
2761
2762 @staticmethod
2763 def _reverse_index(x):
2764 return None if x is None else ~x
2765
2766 def __getitem__(self, idx):
2767 if isinstance(idx, slice):
2768 if self._reversed:
2769 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2770 start, stop, step = idx.start, idx.stop, idx.step or 1
2771 elif isinstance(idx, int):
2772 if self._reversed:
2773 idx = self._reverse_index(idx)
2774 start, stop, step = idx, idx, 0
2775 else:
2776 raise TypeError('indices must be integers or slices')
2777 if ((start or 0) < 0 or (stop or 0) < 0
2778 or (start is None and step < 0)
2779 or (stop is None and step > 0)):
2780 # We need to consume the entire iterable to be able to slice from the end
2781 # Obviously, never use this with infinite iterables
2782 self._exhaust()
2783 try:
2784 return self._cache[idx]
2785 except IndexError as e:
2786 raise self.IndexError(e) from e
2787 n = max(start or 0, stop or 0) - len(self._cache) + 1
2788 if n > 0:
2789 self._cache.extend(itertools.islice(self._iterable, n))
2790 try:
2791 return self._cache[idx]
2792 except IndexError as e:
2793 raise self.IndexError(e) from e
2794
2795 def __bool__(self):
2796 try:
2797 self[-1] if self._reversed else self[0]
2798 except self.IndexError:
2799 return False
2800 return True
2801
2802 def __len__(self):
2803 self._exhaust()
2804 return len(self._cache)
2805
2806 def __reversed__(self):
2807 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2808
2809 def __copy__(self):
2810 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2811
2812 def __repr__(self):
2813 # repr and str should mimic a list. So we exhaust the iterable
2814 return repr(self.exhaust())
2815
2816 def __str__(self):
2817 return repr(self.exhaust())
2818
2819
2820 class PagedList:
2821
2822 class IndexError(IndexError):
2823 pass
2824
2825 def __len__(self):
2826 # This is only useful for tests
2827 return len(self.getslice())
2828
2829 def __init__(self, pagefunc, pagesize, use_cache=True):
2830 self._pagefunc = pagefunc
2831 self._pagesize = pagesize
2832 self._pagecount = float('inf')
2833 self._use_cache = use_cache
2834 self._cache = {}
2835
2836 def getpage(self, pagenum):
2837 page_results = self._cache.get(pagenum)
2838 if page_results is None:
2839 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2840 if self._use_cache:
2841 self._cache[pagenum] = page_results
2842 return page_results
2843
2844 def getslice(self, start=0, end=None):
2845 return list(self._getslice(start, end))
2846
2847 def _getslice(self, start, end):
2848 raise NotImplementedError('This method must be implemented by subclasses')
2849
2850 def __getitem__(self, idx):
2851 assert self._use_cache, 'Indexing PagedList requires cache'
2852 if not isinstance(idx, int) or idx < 0:
2853 raise TypeError('indices must be non-negative integers')
2854 entries = self.getslice(idx, idx + 1)
2855 if not entries:
2856 raise self.IndexError()
2857 return entries[0]
2858
2859
2860 class OnDemandPagedList(PagedList):
2861 """Download pages until a page with less than maximum results"""
2862
2863 def _getslice(self, start, end):
2864 for pagenum in itertools.count(start // self._pagesize):
2865 firstid = pagenum * self._pagesize
2866 nextfirstid = pagenum * self._pagesize + self._pagesize
2867 if start >= nextfirstid:
2868 continue
2869
2870 startv = (
2871 start % self._pagesize
2872 if firstid <= start < nextfirstid
2873 else 0)
2874 endv = (
2875 ((end - 1) % self._pagesize) + 1
2876 if (end is not None and firstid <= end <= nextfirstid)
2877 else None)
2878
2879 try:
2880 page_results = self.getpage(pagenum)
2881 except Exception:
2882 self._pagecount = pagenum - 1
2883 raise
2884 if startv != 0 or endv is not None:
2885 page_results = page_results[startv:endv]
2886 yield from page_results
2887
2888 # A little optimization: if the current page is not "full", i.e. does
2889 # not contain page_size videos, we can assume that this page
2890 # is the last one - there are no more ids on further pages -
2891 # so there is no need to query again.
2892 if len(page_results) + startv < self._pagesize:
2893 break
2894
2895 # If we got the whole page, but the next page is not interesting,
2896 # break out early as well
2897 if end == nextfirstid:
2898 break
2899
2900
2901 class InAdvancePagedList(PagedList):
2902 """PagedList with total number of pages known in advance"""
2903
2904 def __init__(self, pagefunc, pagecount, pagesize):
2905 PagedList.__init__(self, pagefunc, pagesize, True)
2906 self._pagecount = pagecount
2907
2908 def _getslice(self, start, end):
2909 start_page = start // self._pagesize
2910 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2911 skip_elems = start - start_page * self._pagesize
2912 only_more = None if end is None else end - start
2913 for pagenum in range(start_page, end_page):
2914 page_results = self.getpage(pagenum)
2915 if skip_elems:
2916 page_results = page_results[skip_elems:]
2917 skip_elems = None
2918 if only_more is not None:
2919 if len(page_results) < only_more:
2920 only_more -= len(page_results)
2921 else:
2922 yield from page_results[:only_more]
2923 break
2924 yield from page_results
2925
2926
2927 class PlaylistEntries:
2928 MissingEntry = object()
2929 is_exhausted = False
2930
2931 def __init__(self, ydl, info_dict):
2932 self.ydl = ydl
2933
2934 # _entries must be assigned now since info_dict can change during iteration
2935 entries = info_dict.get('entries')
2936 if entries is None:
2937 raise EntryNotInPlaylist('There are no entries')
2938 elif isinstance(entries, list):
2939 self.is_exhausted = True
2940
2941 requested_entries = info_dict.get('requested_entries')
2942 self.is_incomplete = requested_entries is not None
2943 if self.is_incomplete:
2944 assert self.is_exhausted
2945 self._entries = [self.MissingEntry] * max(requested_entries or [0])
2946 for i, entry in zip(requested_entries, entries):
2947 self._entries[i - 1] = entry
2948 elif isinstance(entries, (list, PagedList, LazyList)):
2949 self._entries = entries
2950 else:
2951 self._entries = LazyList(entries)
2952
2953 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2954 (?P<start>[+-]?\d+)?
2955 (?P<range>[:-]
2956 (?P<end>[+-]?\d+|inf(?:inite)?)?
2957 (?::(?P<step>[+-]?\d+))?
2958 )?''')
2959
2960 @classmethod
2961 def parse_playlist_items(cls, string):
2962 for segment in string.split(','):
2963 if not segment:
2964 raise ValueError('There are two or more consecutive commas')
2965 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2966 if not mobj:
2967 raise ValueError(f'{segment!r} is not a valid specification')
2968 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2969 if int_or_none(step) == 0:
2970 raise ValueError(f'Step in {segment!r} cannot be zero')
2971 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2972
2973 def get_requested_items(self):
2974 playlist_items = self.ydl.params.get('playlist_items')
2975 playlist_start = self.ydl.params.get('playliststart', 1)
2976 playlist_end = self.ydl.params.get('playlistend')
2977 # For backwards compatibility, interpret -1 as whole list
2978 if playlist_end in (-1, None):
2979 playlist_end = ''
2980 if not playlist_items:
2981 playlist_items = f'{playlist_start}:{playlist_end}'
2982 elif playlist_start != 1 or playlist_end:
2983 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2984
2985 for index in self.parse_playlist_items(playlist_items):
2986 for i, entry in self[index]:
2987 yield i, entry
2988 if not entry:
2989 continue
2990 try:
2991 # The item may have just been added to archive. Don't break due to it
2992 if not self.ydl.params.get('lazy_playlist'):
2993 # TODO: Add auto-generated fields
2994 self.ydl._match_entry(entry, incomplete=True, silent=True)
2995 except (ExistingVideoReached, RejectedVideoReached):
2996 return
2997
2998 def get_full_count(self):
2999 if self.is_exhausted and not self.is_incomplete:
3000 return len(self)
3001 elif isinstance(self._entries, InAdvancePagedList):
3002 if self._entries._pagesize == 1:
3003 return self._entries._pagecount
3004
3005 @functools.cached_property
3006 def _getter(self):
3007 if isinstance(self._entries, list):
3008 def get_entry(i):
3009 try:
3010 entry = self._entries[i]
3011 except IndexError:
3012 entry = self.MissingEntry
3013 if not self.is_incomplete:
3014 raise self.IndexError()
3015 if entry is self.MissingEntry:
3016 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
3017 return entry
3018 else:
3019 def get_entry(i):
3020 try:
3021 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3022 except (LazyList.IndexError, PagedList.IndexError):
3023 raise self.IndexError()
3024 return get_entry
3025
3026 def __getitem__(self, idx):
3027 if isinstance(idx, int):
3028 idx = slice(idx, idx)
3029
3030 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3031 step = 1 if idx.step is None else idx.step
3032 if idx.start is None:
3033 start = 0 if step > 0 else len(self) - 1
3034 else:
3035 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3036
3037 # NB: Do not call len(self) when idx == [:]
3038 if idx.stop is None:
3039 stop = 0 if step < 0 else float('inf')
3040 else:
3041 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3042 stop += [-1, 1][step > 0]
3043
3044 for i in frange(start, stop, step):
3045 if i < 0:
3046 continue
3047 try:
3048 entry = self._getter(i)
3049 except self.IndexError:
3050 self.is_exhausted = True
3051 if step > 0:
3052 break
3053 continue
3054 yield i + 1, entry
3055
3056 def __len__(self):
3057 return len(tuple(self[:]))
3058
3059 class IndexError(IndexError):
3060 pass
3061
3062
3063 def uppercase_escape(s):
3064 unicode_escape = codecs.getdecoder('unicode_escape')
3065 return re.sub(
3066 r'\\U[0-9a-fA-F]{8}',
3067 lambda m: unicode_escape(m.group(0))[0],
3068 s)
3069
3070
3071 def lowercase_escape(s):
3072 unicode_escape = codecs.getdecoder('unicode_escape')
3073 return re.sub(
3074 r'\\u[0-9a-fA-F]{4}',
3075 lambda m: unicode_escape(m.group(0))[0],
3076 s)
3077
3078
3079 def escape_rfc3986(s):
3080 """Escape non-ASCII characters as suggested by RFC 3986"""
3081 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3082
3083
3084 def escape_url(url):
3085 """Escape URL as suggested by RFC 3986"""
3086 url_parsed = urllib.parse.urlparse(url)
3087 return url_parsed._replace(
3088 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3089 path=escape_rfc3986(url_parsed.path),
3090 params=escape_rfc3986(url_parsed.params),
3091 query=escape_rfc3986(url_parsed.query),
3092 fragment=escape_rfc3986(url_parsed.fragment)
3093 ).geturl()
3094
3095
3096 def parse_qs(url, **kwargs):
3097 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
3098
3099
3100 def read_batch_urls(batch_fd):
3101 def fixup(url):
3102 if not isinstance(url, str):
3103 url = url.decode('utf-8', 'replace')
3104 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3105 for bom in BOM_UTF8:
3106 if url.startswith(bom):
3107 url = url[len(bom):]
3108 url = url.lstrip()
3109 if not url or url.startswith(('#', ';', ']')):
3110 return False
3111 # "#" cannot be stripped out since it is part of the URI
3112 # However, it can be safely stripped out if it follows a whitespace
3113 return re.split(r'\s#', url, 1)[0].rstrip()
3114
3115 with contextlib.closing(batch_fd) as fd:
3116 return [url for url in map(fixup, fd) if url]
3117
3118
3119 def urlencode_postdata(*args, **kargs):
3120 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3121
3122
3123 def update_url(url, *, query_update=None, **kwargs):
3124 """Replace URL components specified by kwargs
3125 @param url str or parse url tuple
3126 @param query_update update query
3127 @returns str
3128 """
3129 if isinstance(url, str):
3130 if not kwargs and not query_update:
3131 return url
3132 else:
3133 url = urllib.parse.urlparse(url)
3134 if query_update:
3135 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
3136 kwargs['query'] = urllib.parse.urlencode({
3137 **urllib.parse.parse_qs(url.query),
3138 **query_update
3139 }, True)
3140 return urllib.parse.urlunparse(url._replace(**kwargs))
3141
3142
3143 def update_url_query(url, query):
3144 return update_url(url, query_update=query)
3145
3146
3147 def update_Request(req, url=None, data=None, headers=None, query=None):
3148 req_headers = req.headers.copy()
3149 req_headers.update(headers or {})
3150 req_data = data or req.data
3151 req_url = update_url_query(url or req.get_full_url(), query)
3152 req_get_method = req.get_method()
3153 if req_get_method == 'HEAD':
3154 req_type = HEADRequest
3155 elif req_get_method == 'PUT':
3156 req_type = PUTRequest
3157 else:
3158 req_type = urllib.request.Request
3159 new_req = req_type(
3160 req_url, data=req_data, headers=req_headers,
3161 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3162 if hasattr(req, 'timeout'):
3163 new_req.timeout = req.timeout
3164 return new_req
3165
3166
3167 def _multipart_encode_impl(data, boundary):
3168 content_type = 'multipart/form-data; boundary=%s' % boundary
3169
3170 out = b''
3171 for k, v in data.items():
3172 out += b'--' + boundary.encode('ascii') + b'\r\n'
3173 if isinstance(k, str):
3174 k = k.encode()
3175 if isinstance(v, str):
3176 v = v.encode()
3177 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3178 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3179 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3180 if boundary.encode('ascii') in content:
3181 raise ValueError('Boundary overlaps with data')
3182 out += content
3183
3184 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3185
3186 return out, content_type
3187
3188
3189 def multipart_encode(data, boundary=None):
3190 '''
3191 Encode a dict to RFC 7578-compliant form-data
3192
3193 data:
3194 A dict where keys and values can be either Unicode or bytes-like
3195 objects.
3196 boundary:
3197 If specified, the given Unicode object is used as the boundary.
3198 Otherwise a random boundary is generated.
3199
3200 Reference: https://tools.ietf.org/html/rfc7578
3201 '''
3202 has_specified_boundary = boundary is not None
3203
3204 while True:
3205 if boundary is None:
3206 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3207
3208 try:
3209 out, content_type = _multipart_encode_impl(data, boundary)
3210 break
3211 except ValueError:
3212 if has_specified_boundary:
3213 raise
3214 boundary = None
3215
3216 return out, content_type
3217
3218
3219 def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
3220 if blocked_types is NO_DEFAULT:
3221 blocked_types = (str, bytes, collections.abc.Mapping)
3222 return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
3223
3224
3225 def variadic(x, allowed_types=NO_DEFAULT):
3226 return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
3227
3228
3229 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3230 for f in funcs:
3231 try:
3232 val = f(*args, **kwargs)
3233 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
3234 pass
3235 else:
3236 if expected_type is None or isinstance(val, expected_type):
3237 return val
3238
3239
3240 def try_get(src, getter, expected_type=None):
3241 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3242
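# Usage sketch (illustrative):
#   try_get({'a': {'b': 1}}, lambda x: x['a']['b'], int)  # -> 1
#   try_get({}, lambda x: x['missing'])                   # -> None (KeyError is swallowed)
#   try_call(lambda: 1 // 0, lambda: 42)                  # -> 42 (first callable fails)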
3243
3244 def filter_dict(dct, cndn=lambda _, v: v is not None):
3245 return {k: v for k, v in dct.items() if cndn(k, v)}
3246
3247
3248 def merge_dicts(*dicts):
3249 merged = {}
3250 for a_dict in dicts:
3251 for k, v in a_dict.items():
3252 if (v is not None and k not in merged
3253 or isinstance(v, str) and merged[k] == ''):
3254 merged[k] = v
3255 return merged
3256
3257
3258 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3259 return string if isinstance(string, str) else str(string, encoding, errors)
3260
3261
3262 US_RATINGS = {
3263 'G': 0,
3264 'PG': 10,
3265 'PG-13': 13,
3266 'R': 16,
3267 'NC': 18,
3268 }
3269
3270
3271 TV_PARENTAL_GUIDELINES = {
3272 'TV-Y': 0,
3273 'TV-Y7': 7,
3274 'TV-G': 0,
3275 'TV-PG': 0,
3276 'TV-14': 14,
3277 'TV-MA': 17,
3278 }
3279
3280
3281 def parse_age_limit(s):
3282 # isinstance(False, int) is True. So type() must be used instead
3283 if type(s) is int: # noqa: E721
3284 return s if 0 <= s <= 21 else None
3285 elif not isinstance(s, str):
3286 return None
3287 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3288 if m:
3289 return int(m.group('age'))
3290 s = s.upper()
3291 if s in US_RATINGS:
3292 return US_RATINGS[s]
3293 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3294 if m:
3295 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3296 return None
3297
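# Usage sketch (illustrative):
#   parse_age_limit(18)       # -> 18
#   parse_age_limit('18+')    # -> 18
#   parse_age_limit('PG-13')  # -> 13
#   parse_age_limit('TV-MA')  # -> 17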
3298
3299 def strip_jsonp(code):
3300 return re.sub(
3301 r'''(?sx)^
3302 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3303 (?:\s*&&\s*(?P=func_name))?
3304 \s*\(\s*(?P<callback_data>.*)\);?
3305 \s*?(?://[^\n]*)*$''',
3306 r'\g<callback_data>', code)
3307
3308
3309 def js_to_json(code, vars={}, *, strict=False):
3310 # vars is a dict of var, val pairs to substitute
3311 STRING_QUOTES = '\'"`'
3312 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
3313 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3314 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3315 INTEGER_TABLE = (
3316 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3317 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3318 )
3319
3320 def process_escape(match):
3321 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3322 escape = match.group(1) or match.group(2)
3323
3324 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3325 else R'\u00' if escape == 'x'
3326 else '' if escape == '\n'
3327 else escape)
3328
3329 def template_substitute(match):
3330 evaluated = js_to_json(match.group(1), vars, strict=strict)
3331 if evaluated[0] == '"':
3332 return json.loads(evaluated)
3333 return evaluated
3334
3335 def fix_kv(m):
3336 v = m.group(0)
3337 if v in ('true', 'false', 'null'):
3338 return v
3339 elif v in ('undefined', 'void 0'):
3340 return 'null'
3341 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3342 return ''
3343
3344 if v[0] in STRING_QUOTES:
3345 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
3346 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
3347 return f'"{escaped}"'
3348
3349 for regex, base in INTEGER_TABLE:
3350 im = re.match(regex, v)
3351 if im:
3352 i = int(im.group(1), base)
3353 return f'"{i}":' if v.endswith(':') else str(i)
3354
3355 if v in vars:
3356 try:
3357 if not strict:
3358 json.loads(vars[v])
3359 except json.JSONDecodeError:
3360 return json.dumps(vars[v])
3361 else:
3362 return vars[v]
3363
3364 if not strict:
3365 return f'"{v}"'
3366
3367 raise ValueError(f'Unknown value: {v}')
3368
3369 def create_map(mobj):
3370 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3371
3372 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3373 if not strict:
3374 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3375 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
3376 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
3377 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
3378
3379 return re.sub(rf'''(?sx)
3380 {STRING_RE}|
3381 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
3382 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3383 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3384 [0-9]+(?={SKIP_RE}:)|
3385 !+
3386 ''', fix_kv, code)
3387
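# Usage sketch (illustrative):
#   js_to_json("{abc: true, 'def': 0x10}")
#   # -> '{"abc": true, "def": 16}'
#   js_to_json('{key: value}', vars={'value': '"str"'})
#   # -> '{"key": "str"}'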
3388
3389 def qualities(quality_ids):
3390 """ Get a numeric quality value out of a list of possible values """
3391 def q(qid):
3392 try:
3393 return quality_ids.index(qid)
3394 except ValueError:
3395 return -1
3396 return q
3397
3398
3399 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3400
3401
3402 DEFAULT_OUTTMPL = {
3403 'default': '%(title)s [%(id)s].%(ext)s',
3404 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3405 }
3406 OUTTMPL_TYPES = {
3407 'chapter': None,
3408 'subtitle': None,
3409 'thumbnail': None,
3410 'description': 'description',
3411 'annotation': 'annotations.xml',
3412 'infojson': 'info.json',
3413 'link': None,
3414 'pl_video': None,
3415 'pl_thumbnail': None,
3416 'pl_description': 'description',
3417 'pl_infojson': 'info.json',
3418 }
3419
3420 # As of [1], the format syntax is:
3421 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3422 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3423 STR_FORMAT_RE_TMPL = r'''(?x)
3424 (?<!%)(?P<prefix>(?:%%)*)
3425 %
3426 (?P<has_key>\((?P<key>{0})\))?
3427 (?P<format>
3428 (?P<conversion>[#0\-+ ]+)?
3429 (?P<min_width>\d+)?
3430 (?P<precision>\.\d+)?
3431 (?P<len_mod>[hlL])? # unused in python
3432 {1} # conversion type
3433 )
3434 '''
3435
3436
3437 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3438
3439
3440 def limit_length(s, length):
3441 """ Add ellipses to overly long strings """
3442 if s is None:
3443 return None
3444 ELLIPSES = '...'
3445 if len(s) > length:
3446 return s[:length - len(ELLIPSES)] + ELLIPSES
3447 return s
3448
3449
3450 def version_tuple(v):
3451 return tuple(int(e) for e in re.split(r'[-.]', v))
3452
3453
3454 def is_outdated_version(version, limit, assume_new=True):
3455 if not version:
3456 return not assume_new
3457 try:
3458 return version_tuple(version) < version_tuple(limit)
3459 except ValueError:
3460 return not assume_new
3461
3462
3463 def ytdl_is_updateable():
3464 """ Returns if yt-dlp can be updated with -U """
3465
3466 from ..update import is_non_updateable
3467
3468 return not is_non_updateable()
3469
3470
3471 def args_to_str(args):
3472 # Get a short string representation for a subprocess command
3473 return ' '.join(compat_shlex_quote(a) for a in args)
3474
3475
3476 def error_to_str(err):
3477 return f'{type(err).__name__}: {err}'
3478
3479
3480 def mimetype2ext(mt, default=NO_DEFAULT):
3481 if not isinstance(mt, str):
3482 if default is not NO_DEFAULT:
3483 return default
3484 return None
3485
3486 MAP = {
3487 # video
3488 '3gpp': '3gp',
3489 'mp2t': 'ts',
3490 'mp4': 'mp4',
3491 'mpeg': 'mpeg',
3492 'mpegurl': 'm3u8',
3493 'quicktime': 'mov',
3494 'webm': 'webm',
3495 'vp9': 'vp9',
3496 'x-flv': 'flv',
3497 'x-m4v': 'm4v',
3498 'x-matroska': 'mkv',
3499 'x-mng': 'mng',
3500 'x-mp4-fragmented': 'mp4',
3501 'x-ms-asf': 'asf',
3502 'x-ms-wmv': 'wmv',
3503 'x-msvideo': 'avi',
3504
3505 # application (streaming playlists)
3506 'dash+xml': 'mpd',
3507 'f4m+xml': 'f4m',
3508 'hds+xml': 'f4m',
3509 'vnd.apple.mpegurl': 'm3u8',
3510 'vnd.ms-sstr+xml': 'ism',
3511 'x-mpegurl': 'm3u8',
3512
3513 # audio
3514 'audio/mp4': 'm4a',
3515 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3516 # Using .mp3 as it's the most popular one
3517 'audio/mpeg': 'mp3',
3518 'audio/webm': 'webm',
3519 'audio/x-matroska': 'mka',
3520 'audio/x-mpegurl': 'm3u',
3521 'midi': 'mid',
3522 'ogg': 'ogg',
3523 'wav': 'wav',
3524 'wave': 'wav',
3525 'x-aac': 'aac',
3526 'x-flac': 'flac',
3527 'x-m4a': 'm4a',
3528 'x-realaudio': 'ra',
3529 'x-wav': 'wav',
3530
3531 # image
3532 'avif': 'avif',
3533 'bmp': 'bmp',
3534 'gif': 'gif',
3535 'jpeg': 'jpg',
3536 'png': 'png',
3537 'svg+xml': 'svg',
3538 'tiff': 'tif',
3539 'vnd.wap.wbmp': 'wbmp',
3540 'webp': 'webp',
3541 'x-icon': 'ico',
3542 'x-jng': 'jng',
3543 'x-ms-bmp': 'bmp',
3544
3545 # caption
3546 'filmstrip+json': 'fs',
3547 'smptett+xml': 'tt',
3548 'ttaf+xml': 'dfxp',
3549 'ttml+xml': 'ttml',
3550 'x-ms-sami': 'sami',
3551
3552 # misc
3553 'gzip': 'gz',
3554 'json': 'json',
3555 'xml': 'xml',
3556 'zip': 'zip',
3557 }
3558
3559 mimetype = mt.partition(';')[0].strip().lower()
3560 _, _, subtype = mimetype.rpartition('/')
3561
3562 ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
3563 if ext:
3564 return ext
3565 elif default is not NO_DEFAULT:
3566 return default
3567 return subtype.replace('+', '.')
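
# Examples (illustrative):
# mimetype2ext('video/mp4; codecs="avc1.42E01E"') == 'mp4' (parameters are stripped)
# mimetype2ext('application/vnd.apple.mpegurl') == 'm3u8' (via the MAP lookup)
# mimetype2ext('application/dicom') == 'dicom' (fallback to the bare subtype)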
3568
3569
3570 def ext2mimetype(ext_or_url):
3571 if not ext_or_url:
3572 return None
3573 if '.' not in ext_or_url:
3574 ext_or_url = f'file.{ext_or_url}'
3575 return mimetypes.guess_type(ext_or_url)[0]
3576
3577
3578 def parse_codecs(codecs_str):
3579 # http://tools.ietf.org/html/rfc6381
3580 if not codecs_str:
3581 return {}
3582 split_codecs = list(filter(None, map(
3583 str.strip, codecs_str.strip().strip(',').split(','))))
3584 vcodec, acodec, scodec, hdr = None, None, None, None
3585 for full_codec in split_codecs:
3586 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3587 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3588 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3589 if vcodec:
3590 continue
3591 vcodec = full_codec
3592 if parts[0] in ('dvh1', 'dvhe'):
3593 hdr = 'DV'
3594 elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
3595 hdr = 'HDR10'
3596 elif parts[:2] == ['vp9', '2']:
3597 hdr = 'HDR10'
3598 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
3599 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3600 acodec = acodec or full_codec
3601 elif parts[0] in ('stpp', 'wvtt'):
3602 scodec = scodec or full_codec
3603 else:
3604 write_string(f'WARNING: Unknown codec {full_codec}\n')
3605 if vcodec or acodec or scodec:
3606 return {
3607 'vcodec': vcodec or 'none',
3608 'acodec': acodec or 'none',
3609 'dynamic_range': hdr,
3610 **({'scodec': scodec} if scodec is not None else {}),
3611 }
3612 elif len(split_codecs) == 2:
3613 return {
3614 'vcodec': split_codecs[0],
3615 'acodec': split_codecs[1],
3616 }
3617 return {}
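
# Examples (illustrative):
# parse_codecs('avc1.64001F, mp4a.40.2')
# == {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
# parse_codecs('dvh1.05.01') should report 'dynamic_range': 'DV'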
3618
3619
3620 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3621 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3622
3623 allow_mkv = not preferences or 'mkv' in preferences
3624
3625 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3626 return 'mkv' # TODO: any other format allows this?
3627
3628 # TODO: Not all codecs supported by parse_codecs are handled here
3629 COMPATIBLE_CODECS = {
3630 'mp4': {
3631 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
3632 'h264', 'aacl', 'ec-3', # Set in ISM
3633 },
3634 'webm': {
3635 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3636 'vp9x', 'vp8x', # in the webm spec
3637 },
3638 }
3639
3640 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3641 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3642
3643 for ext in preferences or COMPATIBLE_CODECS.keys():
3644 codec_set = COMPATIBLE_CODECS.get(ext, set())
3645 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3646 return ext
3647
3648 COMPATIBLE_EXTS = (
3649 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3650 {'webm', 'weba'},
3651 )
3652 for ext in preferences or vexts:
3653 current_exts = {ext, *vexts, *aexts}
3654 if ext == 'mkv' or current_exts == {ext} or any(
3655 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3656 return ext
3657 return 'mkv' if allow_mkv else preferences[-1]
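
# Example (illustrative): a single av1 + mp4a pair fits into mp4, so
# get_compatible_ext(vcodecs=['av1'], acodecs=['mp4a'], vexts=['mp4'], aexts=['m4a']) == 'mp4'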
3658
3659
3660 def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
3661 getheader = url_handle.headers.get
3662
3663 cd = getheader('Content-Disposition')
3664 if cd:
3665 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3666 if m:
3667 e = determine_ext(m.group('filename'), default_ext=None)
3668 if e:
3669 return e
3670
3671 meta_ext = getheader('x-amz-meta-name')
3672 if meta_ext:
3673 e = meta_ext.rpartition('.')[2]
3674 if e:
3675 return e
3676
3677 return mimetype2ext(getheader('Content-Type'), default=default)
3678
3679
3680 def encode_data_uri(data, mime_type):
3681 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
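
# Example (illustrative): encode_data_uri(b'hi', 'text/plain') == 'data:text/plain;base64,aGk='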
3682
3683
3684 def age_restricted(content_limit, age_limit):
3685 """ Returns True iff the content should be blocked """
3686
3687 if age_limit is None: # No limit set
3688 return False
3689 if content_limit is None:
3690 return False # Content available for everyone
3691 return age_limit < content_limit
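
# Example (illustrative): age_restricted(content_limit=18, age_limit=16) is True (blocked),
# while age_restricted(content_limit=18, age_limit=None) is False (no limit set)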
3692
3693
3694 # List of known byte-order-marks (BOM)
3695 BOMS = [
3696 (b'\xef\xbb\xbf', 'utf-8'),
3697 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3698 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3699 (b'\xff\xfe', 'utf-16-le'),
3700 (b'\xfe\xff', 'utf-16-be'),
3701 ]
3702
3703
3704 def is_html(first_bytes):
3705 """ Detect whether a file contains HTML by examining its first bytes. """
3706
3707 encoding = 'utf-8'
3708 for bom, enc in BOMS:
3709 while first_bytes.startswith(bom):
3710 encoding, first_bytes = enc, first_bytes[len(bom):]
3711
3712 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
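
# Example (illustrative): is_html(b'<!DOCTYPE html>') is truthy, and a leading BOM
# (e.g. b'\xef\xbb\xbf<html>') is stripped before the check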
3713
3714
3715 def determine_protocol(info_dict):
3716 protocol = info_dict.get('protocol')
3717 if protocol is not None:
3718 return protocol
3719
3720 url = sanitize_url(info_dict['url'])
3721 if url.startswith('rtmp'):
3722 return 'rtmp'
3723 elif url.startswith('mms'):
3724 return 'mms'
3725 elif url.startswith('rtsp'):
3726 return 'rtsp'
3727
3728 ext = determine_ext(url)
3729 if ext == 'm3u8':
3730 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3731 elif ext == 'f4m':
3732 return 'f4m'
3733
3734 return urllib.parse.urlparse(url).scheme
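
# Example (illustrative): determine_protocol({'url': 'rtmp://example.com/live'}) == 'rtmp',
# while an explicit 'protocol' key in info_dict always takes precedence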
3735
3736
3737 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3738 """ Render a list of rows, each as a list of values.
3739 Text after a \t will be right aligned """
3740 def width(string):
3741 return len(remove_terminal_sequences(string).replace('\t', ''))
3742
3743 def get_max_lens(table):
3744 return [max(width(str(v)) for v in col) for col in zip(*table)]
3745
3746 def filter_using_list(row, filterArray):
3747 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3748
3749 max_lens = get_max_lens(data) if hide_empty else []
3750 header_row = filter_using_list(header_row, max_lens)
3751 data = [filter_using_list(row, max_lens) for row in data]
3752
3753 table = [header_row] + data
3754 max_lens = get_max_lens(table)
3755 extra_gap += 1
3756 if delim:
3757 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3758 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3759 for row in table:
3760 for pos, text in enumerate(map(str, row)):
3761 if '\t' in text:
3762 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3763 else:
3764 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3765 ret = '\n'.join(''.join(row).rstrip() for row in table)
3766 return ret
3767
3768
3769 def _match_one(filter_part, dct, incomplete):
3770 # TODO: Generalize code with YoutubeDL._build_format_filter
3771 STRING_OPERATORS = {
3772 '*=': operator.contains,
3773 '^=': lambda attr, value: attr.startswith(value),
3774 '$=': lambda attr, value: attr.endswith(value),
3775 '~=': lambda attr, value: re.search(value, attr),
3776 }
3777 COMPARISON_OPERATORS = {
3778 **STRING_OPERATORS,
3779 '<=': operator.le, # "<=" must be defined above "<"
3780 '<': operator.lt,
3781 '>=': operator.ge,
3782 '>': operator.gt,
3783 '=': operator.eq,
3784 }
3785
3786 if isinstance(incomplete, bool):
3787 is_incomplete = lambda _: incomplete
3788 else:
3789 is_incomplete = lambda k: k in incomplete
3790
3791 operator_rex = re.compile(r'''(?x)
3792 (?P<key>[a-z_]+)
3793 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3794 (?:
3795 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3796 (?P<strval>.+?)
3797 )
3798 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3799 m = operator_rex.fullmatch(filter_part.strip())
3800 if m:
3801 m = m.groupdict()
3802 unnegated_op = COMPARISON_OPERATORS[m['op']]
3803 if m['negation']:
3804 op = lambda attr, value: not unnegated_op(attr, value)
3805 else:
3806 op = unnegated_op
3807 comparison_value = m['quotedstrval'] or m['strval']
3808 if m['quote']:
3809 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3810 actual_value = dct.get(m['key'])
3811 numeric_comparison = None
3812 if isinstance(actual_value, (int, float)):
3813 # If the original field is a string and the matching comparison value is
3814 # a number, we should respect the origin of the original field
3815 # and process the comparison value as a string (see
3816 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3817 try:
3818 numeric_comparison = int(comparison_value)
3819 except ValueError:
3820 numeric_comparison = parse_filesize(comparison_value)
3821 if numeric_comparison is None:
3822 numeric_comparison = parse_filesize(f'{comparison_value}B')
3823 if numeric_comparison is None:
3824 numeric_comparison = parse_duration(comparison_value)
3825 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3826 raise ValueError('Operator %s only supports string values!' % m['op'])
3827 if actual_value is None:
3828 return is_incomplete(m['key']) or m['none_inclusive']
3829 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3830
3831 UNARY_OPERATORS = {
3832 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3833 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3834 }
3835 operator_rex = re.compile(r'''(?x)
3836 (?P<op>%s)\s*(?P<key>[a-z_]+)
3837 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3838 m = operator_rex.fullmatch(filter_part.strip())
3839 if m:
3840 op = UNARY_OPERATORS[m.group('op')]
3841 actual_value = dct.get(m.group('key'))
3842 if is_incomplete(m.group('key')) and actual_value is None:
3843 return True
3844 return op(actual_value)
3845
3846 raise ValueError('Invalid filter part %r' % filter_part)
3847
3848
3849 def match_str(filter_str, dct, incomplete=False):
3850 """ Filter a dictionary with a simple string syntax.
3851 @returns Whether the filter passes
3852 @param incomplete Set of keys that is expected to be missing from dct.
3853 Can be True/False to indicate all/none of the keys may be missing.
3854 All conditions on incomplete keys pass if the key is missing
3855 """
3856 return all(
3857 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3858 for filter_part in re.split(r'(?<!\\)&', filter_str))
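
# Example (illustrative): '&' separates conditions and '?' lets one pass when the field
# is missing, so match_str('duration > 600 & like_count >? 100', {'duration': 700}) is True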
3859
3860
3861 def match_filter_func(filters, breaking_filters=None):
3862 if not filters and not breaking_filters:
3863 return None
3864 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3865 filters = set(variadic(filters or []))
3866
3867 interactive = '-' in filters
3868 if interactive:
3869 filters.remove('-')
3870
3871 def _match_func(info_dict, incomplete=False):
3872 ret = breaking_filters(info_dict, incomplete)
3873 if ret is not None:
3874 raise RejectedVideoReached(ret)
3875
3876 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3877 return NO_DEFAULT if interactive and not incomplete else None
3878 else:
3879 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3880 filter_str = ') | ('.join(map(str.strip, filters))
3881 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3882 return _match_func
3883
3884
3885 class download_range_func:
3886 def __init__(self, chapters, ranges):
3887 self.chapters, self.ranges = chapters, ranges
3888
3889 def __call__(self, info_dict, ydl):
3890 if not self.ranges and not self.chapters:
3891 yield {}
3892
3893 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3894 else 'Cannot match chapters since chapter information is unavailable')
3895 for regex in self.chapters or []:
3896 for i, chapter in enumerate(info_dict.get('chapters') or []):
3897 if re.search(regex, chapter['title']):
3898 warning = None
3899 yield {**chapter, 'index': i}
3900 if self.chapters and warning:
3901 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3902
3903 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3904
3905 def __eq__(self, other):
3906 return (isinstance(other, download_range_func)
3907 and self.chapters == other.chapters and self.ranges == other.ranges)
3908
3909 def __repr__(self):
3910 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
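
# Example (illustrative): a plain time range yields one section dict;
# list(download_range_func([], [(10, 20)])({'id': 'x'}, ydl=None))
# == [{'start_time': 10, 'end_time': 20}]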
3911
3912
3913 def parse_dfxp_time_expr(time_expr):
3914 if not time_expr:
3915 return
3916
3917 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3918 if mobj:
3919 return float(mobj.group('time_offset'))
3920
3921 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3922 if mobj:
3923 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
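
# Examples (illustrative): parse_dfxp_time_expr('5.2s') == 5.2
# and parse_dfxp_time_expr('00:01:02.500') == 62.5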
3924
3925
3926 def srt_subtitles_timecode(seconds):
3927 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3928
3929
3930 def ass_subtitles_timecode(seconds):
3931 time = timetuple_from_msec(seconds * 1000)
3932 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
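
# Examples (illustrative): srt_subtitles_timecode(61.5) == '00:01:01,500'
# and ass_subtitles_timecode(61.5) == '0:01:01.50'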
3933
3934
3935 def dfxp2srt(dfxp_data):
3936 '''
3937 @param dfxp_data A bytes-like object containing DFXP data
3938 @returns A unicode object containing converted SRT data
3939 '''
3940 LEGACY_NAMESPACES = (
3941 (b'http://www.w3.org/ns/ttml', [
3942 b'http://www.w3.org/2004/11/ttaf1',
3943 b'http://www.w3.org/2006/04/ttaf1',
3944 b'http://www.w3.org/2006/10/ttaf1',
3945 ]),
3946 (b'http://www.w3.org/ns/ttml#styling', [
3947 b'http://www.w3.org/ns/ttml#style',
3948 ]),
3949 )
3950
3951 SUPPORTED_STYLING = [
3952 'color',
3953 'fontFamily',
3954 'fontSize',
3955 'fontStyle',
3956 'fontWeight',
3957 'textDecoration'
3958 ]
3959
3960 _x = functools.partial(xpath_with_ns, ns_map={
3961 'xml': 'http://www.w3.org/XML/1998/namespace',
3962 'ttml': 'http://www.w3.org/ns/ttml',
3963 'tts': 'http://www.w3.org/ns/ttml#styling',
3964 })
3965
3966 styles = {}
3967 default_style = {}
3968
3969 class TTMLPElementParser:
3970 _out = ''
3971 _unclosed_elements = []
3972 _applied_styles = []
3973
3974 def start(self, tag, attrib):
3975 if tag in (_x('ttml:br'), 'br'):
3976 self._out += '\n'
3977 else:
3978 unclosed_elements = []
3979 style = {}
3980 element_style_id = attrib.get('style')
3981 if default_style:
3982 style.update(default_style)
3983 if element_style_id:
3984 style.update(styles.get(element_style_id, {}))
3985 for prop in SUPPORTED_STYLING:
3986 prop_val = attrib.get(_x('tts:' + prop))
3987 if prop_val:
3988 style[prop] = prop_val
3989 if style:
3990 font = ''
3991 for k, v in sorted(style.items()):
3992 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3993 continue
3994 if k == 'color':
3995 font += ' color="%s"' % v
3996 elif k == 'fontSize':
3997 font += ' size="%s"' % v
3998 elif k == 'fontFamily':
3999 font += ' face="%s"' % v
4000 elif k == 'fontWeight' and v == 'bold':
4001 self._out += '<b>'
4002 unclosed_elements.append('b')
4003 elif k == 'fontStyle' and v == 'italic':
4004 self._out += '<i>'
4005 unclosed_elements.append('i')
4006 elif k == 'textDecoration' and v == 'underline':
4007 self._out += '<u>'
4008 unclosed_elements.append('u')
4009 if font:
4010 self._out += '<font' + font + '>'
4011 unclosed_elements.append('font')
4012 applied_style = {}
4013 if self._applied_styles:
4014 applied_style.update(self._applied_styles[-1])
4015 applied_style.update(style)
4016 self._applied_styles.append(applied_style)
4017 self._unclosed_elements.append(unclosed_elements)
4018
4019 def end(self, tag):
4020 if tag not in (_x('ttml:br'), 'br'):
4021 unclosed_elements = self._unclosed_elements.pop()
4022 for element in reversed(unclosed_elements):
4023 self._out += '</%s>' % element
4024 if unclosed_elements and self._applied_styles:
4025 self._applied_styles.pop()
4026
4027 def data(self, data):
4028 self._out += data
4029
4030 def close(self):
4031 return self._out.strip()
4032
4033 # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
4034 # This will not trigger false positives since only UTF-8 text is being replaced
4035 dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
4036
4037 def parse_node(node):
4038 target = TTMLPElementParser()
4039 parser = xml.etree.ElementTree.XMLParser(target=target)
4040 parser.feed(xml.etree.ElementTree.tostring(node))
4041 return parser.close()
4042
4043 for k, v in LEGACY_NAMESPACES:
4044 for ns in v:
4045 dfxp_data = dfxp_data.replace(ns, k)
4046
4047 dfxp = compat_etree_fromstring(dfxp_data)
4048 out = []
4049 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4050
4051 if not paras:
4052 raise ValueError('Invalid dfxp/TTML subtitle')
4053
4054 repeat = False
4055 while True:
4056 for style in dfxp.findall(_x('.//ttml:style')):
4057 style_id = style.get('id') or style.get(_x('xml:id'))
4058 if not style_id:
4059 continue
4060 parent_style_id = style.get('style')
4061 if parent_style_id:
4062 if parent_style_id not in styles:
4063 repeat = True
4064 continue
4065 styles[style_id] = styles[parent_style_id].copy()
4066 for prop in SUPPORTED_STYLING:
4067 prop_val = style.get(_x('tts:' + prop))
4068 if prop_val:
4069 styles.setdefault(style_id, {})[prop] = prop_val
4070 if repeat:
4071 repeat = False
4072 else:
4073 break
4074
4075 for p in ('body', 'div'):
4076 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4077 if ele is None:
4078 continue
4079 style = styles.get(ele.get('style'))
4080 if not style:
4081 continue
4082 default_style.update(style)
4083
4084 for para, index in zip(paras, itertools.count(1)):
4085 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4086 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4087 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4088 if begin_time is None:
4089 continue
4090 if not end_time:
4091 if not dur:
4092 continue
4093 end_time = begin_time + dur
4094 out.append('%d\n%s --> %s\n%s\n\n' % (
4095 index,
4096 srt_subtitles_timecode(begin_time),
4097 srt_subtitles_timecode(end_time),
4098 parse_node(para)))
4099
4100 return ''.join(out)
4101
4102
4103 def cli_option(params, command_option, param, separator=None):
4104 param = params.get(param)
4105 return ([] if param is None
4106 else [command_option, str(param)] if separator is None
4107 else [f'{command_option}{separator}{param}'])
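
# Examples (illustrative):
# cli_option({'proxy': 'socks5://127.0.0.1'}, '--proxy', 'proxy') == ['--proxy', 'socks5://127.0.0.1']
# cli_option({'retries': 10}, '--retries', 'retries', separator='=') == ['--retries=10']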
4108
4109
4110 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4111 param = params.get(param)
4112 assert param in (True, False, None)
4113 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
4114
4115
4116 def cli_valueless_option(params, command_option, param, expected_value=True):
4117 return [command_option] if params.get(param) == expected_value else []
4118
4119
4120 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4121 if isinstance(argdict, (list, tuple)): # for backward compatibility
4122 if use_compat:
4123 return argdict
4124 else:
4125 argdict = None
4126 if argdict is None:
4127 return default
4128 assert isinstance(argdict, dict)
4129
4130 assert isinstance(keys, (list, tuple))
4131 for key_list in keys:
4132 arg_list = list(filter(
4133 lambda x: x is not None,
4134 [argdict.get(key.lower()) for key in variadic(key_list)]))
4135 if arg_list:
4136 return [arg for args in arg_list for arg in args]
4137 return default
4138
4139
4140 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4141 main_key, exe = main_key.lower(), exe.lower()
4142 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4143 keys = [f'{root_key}{k}' for k in (keys or [''])]
4144 if root_key in keys:
4145 if main_key != exe:
4146 keys.append((main_key, exe))
4147 keys.append('default')
4148 else:
4149 use_compat = False
4150 return cli_configuration_args(argdict, keys, default, use_compat)
4151
4152
4153 class ISO639Utils:
4154 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4155 _lang_map = {
4156 'aa': 'aar',
4157 'ab': 'abk',
4158 'ae': 'ave',
4159 'af': 'afr',
4160 'ak': 'aka',
4161 'am': 'amh',
4162 'an': 'arg',
4163 'ar': 'ara',
4164 'as': 'asm',
4165 'av': 'ava',
4166 'ay': 'aym',
4167 'az': 'aze',
4168 'ba': 'bak',
4169 'be': 'bel',
4170 'bg': 'bul',
4171 'bh': 'bih',
4172 'bi': 'bis',
4173 'bm': 'bam',
4174 'bn': 'ben',
4175 'bo': 'bod',
4176 'br': 'bre',
4177 'bs': 'bos',
4178 'ca': 'cat',
4179 'ce': 'che',
4180 'ch': 'cha',
4181 'co': 'cos',
4182 'cr': 'cre',
4183 'cs': 'ces',
4184 'cu': 'chu',
4185 'cv': 'chv',
4186 'cy': 'cym',
4187 'da': 'dan',
4188 'de': 'deu',
4189 'dv': 'div',
4190 'dz': 'dzo',
4191 'ee': 'ewe',
4192 'el': 'ell',
4193 'en': 'eng',
4194 'eo': 'epo',
4195 'es': 'spa',
4196 'et': 'est',
4197 'eu': 'eus',
4198 'fa': 'fas',
4199 'ff': 'ful',
4200 'fi': 'fin',
4201 'fj': 'fij',
4202 'fo': 'fao',
4203 'fr': 'fra',
4204 'fy': 'fry',
4205 'ga': 'gle',
4206 'gd': 'gla',
4207 'gl': 'glg',
4208 'gn': 'grn',
4209 'gu': 'guj',
4210 'gv': 'glv',
4211 'ha': 'hau',
4212 'he': 'heb',
4213 'iw': 'heb', # Replaced by he in 1989 revision
4214 'hi': 'hin',
4215 'ho': 'hmo',
4216 'hr': 'hrv',
4217 'ht': 'hat',
4218 'hu': 'hun',
4219 'hy': 'hye',
4220 'hz': 'her',
4221 'ia': 'ina',
4222 'id': 'ind',
4223 'in': 'ind', # Replaced by id in 1989 revision
4224 'ie': 'ile',
4225 'ig': 'ibo',
4226 'ii': 'iii',
4227 'ik': 'ipk',
4228 'io': 'ido',
4229 'is': 'isl',
4230 'it': 'ita',
4231 'iu': 'iku',
4232 'ja': 'jpn',
4233 'jv': 'jav',
4234 'ka': 'kat',
4235 'kg': 'kon',
4236 'ki': 'kik',
4237 'kj': 'kua',
4238 'kk': 'kaz',
4239 'kl': 'kal',
4240 'km': 'khm',
4241 'kn': 'kan',
4242 'ko': 'kor',
4243 'kr': 'kau',
4244 'ks': 'kas',
4245 'ku': 'kur',
4246 'kv': 'kom',
4247 'kw': 'cor',
4248 'ky': 'kir',
4249 'la': 'lat',
4250 'lb': 'ltz',
4251 'lg': 'lug',
4252 'li': 'lim',
4253 'ln': 'lin',
4254 'lo': 'lao',
4255 'lt': 'lit',
4256 'lu': 'lub',
4257 'lv': 'lav',
4258 'mg': 'mlg',
4259 'mh': 'mah',
4260 'mi': 'mri',
4261 'mk': 'mkd',
4262 'ml': 'mal',
4263 'mn': 'mon',
4264 'mr': 'mar',
4265 'ms': 'msa',
4266 'mt': 'mlt',
4267 'my': 'mya',
4268 'na': 'nau',
4269 'nb': 'nob',
4270 'nd': 'nde',
4271 'ne': 'nep',
4272 'ng': 'ndo',
4273 'nl': 'nld',
4274 'nn': 'nno',
4275 'no': 'nor',
4276 'nr': 'nbl',
4277 'nv': 'nav',
4278 'ny': 'nya',
4279 'oc': 'oci',
4280 'oj': 'oji',
4281 'om': 'orm',
4282 'or': 'ori',
4283 'os': 'oss',
4284 'pa': 'pan',
4285 'pi': 'pli',
4286 'pl': 'pol',
4287 'ps': 'pus',
4288 'pt': 'por',
4289 'qu': 'que',
4290 'rm': 'roh',
4291 'rn': 'run',
4292 'ro': 'ron',
4293 'ru': 'rus',
4294 'rw': 'kin',
4295 'sa': 'san',
4296 'sc': 'srd',
4297 'sd': 'snd',
4298 'se': 'sme',
4299 'sg': 'sag',
4300 'si': 'sin',
4301 'sk': 'slk',
4302 'sl': 'slv',
4303 'sm': 'smo',
4304 'sn': 'sna',
4305 'so': 'som',
4306 'sq': 'sqi',
4307 'sr': 'srp',
4308 'ss': 'ssw',
4309 'st': 'sot',
4310 'su': 'sun',
4311 'sv': 'swe',
4312 'sw': 'swa',
4313 'ta': 'tam',
4314 'te': 'tel',
4315 'tg': 'tgk',
4316 'th': 'tha',
4317 'ti': 'tir',
4318 'tk': 'tuk',
4319 'tl': 'tgl',
4320 'tn': 'tsn',
4321 'to': 'ton',
4322 'tr': 'tur',
4323 'ts': 'tso',
4324 'tt': 'tat',
4325 'tw': 'twi',
4326 'ty': 'tah',
4327 'ug': 'uig',
4328 'uk': 'ukr',
4329 'ur': 'urd',
4330 'uz': 'uzb',
4331 've': 'ven',
4332 'vi': 'vie',
4333 'vo': 'vol',
4334 'wa': 'wln',
4335 'wo': 'wol',
4336 'xh': 'xho',
4337 'yi': 'yid',
4338 'ji': 'yid', # Replaced by yi in 1989 revision
4339 'yo': 'yor',
4340 'za': 'zha',
4341 'zh': 'zho',
4342 'zu': 'zul',
4343 }
4344
4345 @classmethod
4346 def short2long(cls, code):
4347 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4348 return cls._lang_map.get(code[:2])
4349
4350 @classmethod
4351 def long2short(cls, code):
4352 """Convert language code from ISO 639-2/T to ISO 639-1"""
4353 for short_name, long_name in cls._lang_map.items():
4354 if long_name == code:
4355 return short_name
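
# Examples (illustrative): ISO639Utils.short2long('en') == 'eng'
# and ISO639Utils.long2short('deu') == 'de'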
4356
4357
4358 class ISO3166Utils:
4359 # From http://data.okfn.org/data/core/country-list
4360 _country_map = {
4361 'AF': 'Afghanistan',
4362 'AX': 'Åland Islands',
4363 'AL': 'Albania',
4364 'DZ': 'Algeria',
4365 'AS': 'American Samoa',
4366 'AD': 'Andorra',
4367 'AO': 'Angola',
4368 'AI': 'Anguilla',
4369 'AQ': 'Antarctica',
4370 'AG': 'Antigua and Barbuda',
4371 'AR': 'Argentina',
4372 'AM': 'Armenia',
4373 'AW': 'Aruba',
4374 'AU': 'Australia',
4375 'AT': 'Austria',
4376 'AZ': 'Azerbaijan',
4377 'BS': 'Bahamas',
4378 'BH': 'Bahrain',
4379 'BD': 'Bangladesh',
4380 'BB': 'Barbados',
4381 'BY': 'Belarus',
4382 'BE': 'Belgium',
4383 'BZ': 'Belize',
4384 'BJ': 'Benin',
4385 'BM': 'Bermuda',
4386 'BT': 'Bhutan',
4387 'BO': 'Bolivia, Plurinational State of',
4388 'BQ': 'Bonaire, Sint Eustatius and Saba',
4389 'BA': 'Bosnia and Herzegovina',
4390 'BW': 'Botswana',
4391 'BV': 'Bouvet Island',
4392 'BR': 'Brazil',
4393 'IO': 'British Indian Ocean Territory',
4394 'BN': 'Brunei Darussalam',
4395 'BG': 'Bulgaria',
4396 'BF': 'Burkina Faso',
4397 'BI': 'Burundi',
4398 'KH': 'Cambodia',
4399 'CM': 'Cameroon',
4400 'CA': 'Canada',
4401 'CV': 'Cape Verde',
4402 'KY': 'Cayman Islands',
4403 'CF': 'Central African Republic',
4404 'TD': 'Chad',
4405 'CL': 'Chile',
4406 'CN': 'China',
4407 'CX': 'Christmas Island',
4408 'CC': 'Cocos (Keeling) Islands',
4409 'CO': 'Colombia',
4410 'KM': 'Comoros',
4411 'CG': 'Congo',
4412 'CD': 'Congo, the Democratic Republic of the',
4413 'CK': 'Cook Islands',
4414 'CR': 'Costa Rica',
4415 'CI': 'Côte d\'Ivoire',
4416 'HR': 'Croatia',
4417 'CU': 'Cuba',
4418 'CW': 'Curaçao',
4419 'CY': 'Cyprus',
4420 'CZ': 'Czech Republic',
4421 'DK': 'Denmark',
4422 'DJ': 'Djibouti',
4423 'DM': 'Dominica',
4424 'DO': 'Dominican Republic',
4425 'EC': 'Ecuador',
4426 'EG': 'Egypt',
4427 'SV': 'El Salvador',
4428 'GQ': 'Equatorial Guinea',
4429 'ER': 'Eritrea',
4430 'EE': 'Estonia',
4431 'ET': 'Ethiopia',
4432 'FK': 'Falkland Islands (Malvinas)',
4433 'FO': 'Faroe Islands',
4434 'FJ': 'Fiji',
4435 'FI': 'Finland',
4436 'FR': 'France',
4437 'GF': 'French Guiana',
4438 'PF': 'French Polynesia',
4439 'TF': 'French Southern Territories',
4440 'GA': 'Gabon',
4441 'GM': 'Gambia',
4442 'GE': 'Georgia',
4443 'DE': 'Germany',
4444 'GH': 'Ghana',
4445 'GI': 'Gibraltar',
4446 'GR': 'Greece',
4447 'GL': 'Greenland',
4448 'GD': 'Grenada',
4449 'GP': 'Guadeloupe',
4450 'GU': 'Guam',
4451 'GT': 'Guatemala',
4452 'GG': 'Guernsey',
4453 'GN': 'Guinea',
4454 'GW': 'Guinea-Bissau',
4455 'GY': 'Guyana',
4456 'HT': 'Haiti',
4457 'HM': 'Heard Island and McDonald Islands',
4458 'VA': 'Holy See (Vatican City State)',
4459 'HN': 'Honduras',
4460 'HK': 'Hong Kong',
4461 'HU': 'Hungary',
4462 'IS': 'Iceland',
4463 'IN': 'India',
4464 'ID': 'Indonesia',
4465 'IR': 'Iran, Islamic Republic of',
4466 'IQ': 'Iraq',
4467 'IE': 'Ireland',
4468 'IM': 'Isle of Man',
4469 'IL': 'Israel',
4470 'IT': 'Italy',
4471 'JM': 'Jamaica',
4472 'JP': 'Japan',
4473 'JE': 'Jersey',
4474 'JO': 'Jordan',
4475 'KZ': 'Kazakhstan',
4476 'KE': 'Kenya',
4477 'KI': 'Kiribati',
4478 'KP': 'Korea, Democratic People\'s Republic of',
4479 'KR': 'Korea, Republic of',
4480 'KW': 'Kuwait',
4481 'KG': 'Kyrgyzstan',
4482 'LA': 'Lao People\'s Democratic Republic',
4483 'LV': 'Latvia',
4484 'LB': 'Lebanon',
4485 'LS': 'Lesotho',
4486 'LR': 'Liberia',
4487 'LY': 'Libya',
4488 'LI': 'Liechtenstein',
4489 'LT': 'Lithuania',
4490 'LU': 'Luxembourg',
4491 'MO': 'Macao',
4492 'MK': 'Macedonia, the Former Yugoslav Republic of',
4493 'MG': 'Madagascar',
4494 'MW': 'Malawi',
4495 'MY': 'Malaysia',
4496 'MV': 'Maldives',
4497 'ML': 'Mali',
4498 'MT': 'Malta',
4499 'MH': 'Marshall Islands',
4500 'MQ': 'Martinique',
4501 'MR': 'Mauritania',
4502 'MU': 'Mauritius',
4503 'YT': 'Mayotte',
4504 'MX': 'Mexico',
4505 'FM': 'Micronesia, Federated States of',
4506 'MD': 'Moldova, Republic of',
4507 'MC': 'Monaco',
4508 'MN': 'Mongolia',
4509 'ME': 'Montenegro',
4510 'MS': 'Montserrat',
4511 'MA': 'Morocco',
4512 'MZ': 'Mozambique',
4513 'MM': 'Myanmar',
4514 'NA': 'Namibia',
4515 'NR': 'Nauru',
4516 'NP': 'Nepal',
4517 'NL': 'Netherlands',
4518 'NC': 'New Caledonia',
4519 'NZ': 'New Zealand',
4520 'NI': 'Nicaragua',
4521 'NE': 'Niger',
4522 'NG': 'Nigeria',
4523 'NU': 'Niue',
4524 'NF': 'Norfolk Island',
4525 'MP': 'Northern Mariana Islands',
4526 'NO': 'Norway',
4527 'OM': 'Oman',
4528 'PK': 'Pakistan',
4529 'PW': 'Palau',
4530 'PS': 'Palestine, State of',
4531 'PA': 'Panama',
4532 'PG': 'Papua New Guinea',
4533 'PY': 'Paraguay',
4534 'PE': 'Peru',
4535 'PH': 'Philippines',
4536 'PN': 'Pitcairn',
4537 'PL': 'Poland',
4538 'PT': 'Portugal',
4539 'PR': 'Puerto Rico',
4540 'QA': 'Qatar',
4541 'RE': 'Réunion',
4542 'RO': 'Romania',
4543 'RU': 'Russian Federation',
4544 'RW': 'Rwanda',
4545 'BL': 'Saint Barthélemy',
4546 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4547 'KN': 'Saint Kitts and Nevis',
4548 'LC': 'Saint Lucia',
4549 'MF': 'Saint Martin (French part)',
4550 'PM': 'Saint Pierre and Miquelon',
4551 'VC': 'Saint Vincent and the Grenadines',
4552 'WS': 'Samoa',
4553 'SM': 'San Marino',
4554 'ST': 'Sao Tome and Principe',
4555 'SA': 'Saudi Arabia',
4556 'SN': 'Senegal',
4557 'RS': 'Serbia',
4558 'SC': 'Seychelles',
4559 'SL': 'Sierra Leone',
4560 'SG': 'Singapore',
4561 'SX': 'Sint Maarten (Dutch part)',
4562 'SK': 'Slovakia',
4563 'SI': 'Slovenia',
4564 'SB': 'Solomon Islands',
4565 'SO': 'Somalia',
4566 'ZA': 'South Africa',
4567 'GS': 'South Georgia and the South Sandwich Islands',
4568 'SS': 'South Sudan',
4569 'ES': 'Spain',
4570 'LK': 'Sri Lanka',
4571 'SD': 'Sudan',
4572 'SR': 'Suriname',
4573 'SJ': 'Svalbard and Jan Mayen',
4574 'SZ': 'Swaziland',
4575 'SE': 'Sweden',
4576 'CH': 'Switzerland',
4577 'SY': 'Syrian Arab Republic',
4578 'TW': 'Taiwan, Province of China',
4579 'TJ': 'Tajikistan',
4580 'TZ': 'Tanzania, United Republic of',
4581 'TH': 'Thailand',
4582 'TL': 'Timor-Leste',
4583 'TG': 'Togo',
4584 'TK': 'Tokelau',
4585 'TO': 'Tonga',
4586 'TT': 'Trinidad and Tobago',
4587 'TN': 'Tunisia',
4588 'TR': 'Turkey',
4589 'TM': 'Turkmenistan',
4590 'TC': 'Turks and Caicos Islands',
4591 'TV': 'Tuvalu',
4592 'UG': 'Uganda',
4593 'UA': 'Ukraine',
4594 'AE': 'United Arab Emirates',
4595 'GB': 'United Kingdom',
4596 'US': 'United States',
4597 'UM': 'United States Minor Outlying Islands',
4598 'UY': 'Uruguay',
4599 'UZ': 'Uzbekistan',
4600 'VU': 'Vanuatu',
4601 'VE': 'Venezuela, Bolivarian Republic of',
4602 'VN': 'Viet Nam',
4603 'VG': 'Virgin Islands, British',
4604 'VI': 'Virgin Islands, U.S.',
4605 'WF': 'Wallis and Futuna',
4606 'EH': 'Western Sahara',
4607 'YE': 'Yemen',
4608 'ZM': 'Zambia',
4609 'ZW': 'Zimbabwe',
4610 # Not ISO 3166 codes, but used for IP blocks
4611 'AP': 'Asia/Pacific Region',
4612 'EU': 'Europe',
4613 }
4614
4615 @classmethod
4616 def short2full(cls, code):
4617 """Convert an ISO 3166-2 country code to the corresponding full name"""
4618 return cls._country_map.get(code.upper())
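
# Example (illustrative): ISO3166Utils.short2full('DE') == 'Germany'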
4619
4620
4621 class GeoUtils:
4622 # Major IPv4 address blocks per country
4623 _country_ip_map = {
4624 'AD': '46.172.224.0/19',
4625 'AE': '94.200.0.0/13',
4626 'AF': '149.54.0.0/17',
4627 'AG': '209.59.64.0/18',
4628 'AI': '204.14.248.0/21',
4629 'AL': '46.99.0.0/16',
4630 'AM': '46.70.0.0/15',
4631 'AO': '105.168.0.0/13',
4632 'AP': '182.50.184.0/21',
4633 'AQ': '23.154.160.0/24',
4634 'AR': '181.0.0.0/12',
4635 'AS': '202.70.112.0/20',
4636 'AT': '77.116.0.0/14',
4637 'AU': '1.128.0.0/11',
4638 'AW': '181.41.0.0/18',
4639 'AX': '185.217.4.0/22',
4640 'AZ': '5.197.0.0/16',
4641 'BA': '31.176.128.0/17',
4642 'BB': '65.48.128.0/17',
4643 'BD': '114.130.0.0/16',
4644 'BE': '57.0.0.0/8',
4645 'BF': '102.178.0.0/15',
4646 'BG': '95.42.0.0/15',
4647 'BH': '37.131.0.0/17',
4648 'BI': '154.117.192.0/18',
4649 'BJ': '137.255.0.0/16',
4650 'BL': '185.212.72.0/23',
4651 'BM': '196.12.64.0/18',
4652 'BN': '156.31.0.0/16',
4653 'BO': '161.56.0.0/16',
4654 'BQ': '161.0.80.0/20',
4655 'BR': '191.128.0.0/12',
4656 'BS': '24.51.64.0/18',
4657 'BT': '119.2.96.0/19',
4658 'BW': '168.167.0.0/16',
4659 'BY': '178.120.0.0/13',
4660 'BZ': '179.42.192.0/18',
4661 'CA': '99.224.0.0/11',
4662 'CD': '41.243.0.0/16',
4663 'CF': '197.242.176.0/21',
4664 'CG': '160.113.0.0/16',
4665 'CH': '85.0.0.0/13',
4666 'CI': '102.136.0.0/14',
4667 'CK': '202.65.32.0/19',
4668 'CL': '152.172.0.0/14',
4669 'CM': '102.244.0.0/14',
4670 'CN': '36.128.0.0/10',
4671 'CO': '181.240.0.0/12',
4672 'CR': '201.192.0.0/12',
4673 'CU': '152.206.0.0/15',
4674 'CV': '165.90.96.0/19',
4675 'CW': '190.88.128.0/17',
4676 'CY': '31.153.0.0/16',
4677 'CZ': '88.100.0.0/14',
4678 'DE': '53.0.0.0/8',
4679 'DJ': '197.241.0.0/17',
4680 'DK': '87.48.0.0/12',
4681 'DM': '192.243.48.0/20',
4682 'DO': '152.166.0.0/15',
4683 'DZ': '41.96.0.0/12',
4684 'EC': '186.68.0.0/15',
4685 'EE': '90.190.0.0/15',
4686 'EG': '156.160.0.0/11',
4687 'ER': '196.200.96.0/20',
4688 'ES': '88.0.0.0/11',
4689 'ET': '196.188.0.0/14',
4690 'EU': '2.16.0.0/13',
4691 'FI': '91.152.0.0/13',
4692 'FJ': '144.120.0.0/16',
4693 'FK': '80.73.208.0/21',
4694 'FM': '119.252.112.0/20',
4695 'FO': '88.85.32.0/19',
4696 'FR': '90.0.0.0/9',
4697 'GA': '41.158.0.0/15',
4698 'GB': '25.0.0.0/8',
4699 'GD': '74.122.88.0/21',
4700 'GE': '31.146.0.0/16',
4701 'GF': '161.22.64.0/18',
4702 'GG': '62.68.160.0/19',
4703 'GH': '154.160.0.0/12',
4704 'GI': '95.164.0.0/16',
4705 'GL': '88.83.0.0/19',
4706 'GM': '160.182.0.0/15',
4707 'GN': '197.149.192.0/18',
4708 'GP': '104.250.0.0/19',
4709 'GQ': '105.235.224.0/20',
4710 'GR': '94.64.0.0/13',
4711 'GT': '168.234.0.0/16',
4712 'GU': '168.123.0.0/16',
4713 'GW': '197.214.80.0/20',
4714 'GY': '181.41.64.0/18',
4715 'HK': '113.252.0.0/14',
4716 'HN': '181.210.0.0/16',
4717 'HR': '93.136.0.0/13',
4718 'HT': '148.102.128.0/17',
4719 'HU': '84.0.0.0/14',
4720 'ID': '39.192.0.0/10',
4721 'IE': '87.32.0.0/12',
4722 'IL': '79.176.0.0/13',
4723 'IM': '5.62.80.0/20',
4724 'IN': '117.192.0.0/10',
4725 'IO': '203.83.48.0/21',
4726 'IQ': '37.236.0.0/14',
4727 'IR': '2.176.0.0/12',
4728 'IS': '82.221.0.0/16',
4729 'IT': '79.0.0.0/10',
4730 'JE': '87.244.64.0/18',
4731 'JM': '72.27.0.0/17',
4732 'JO': '176.29.0.0/16',
4733 'JP': '133.0.0.0/8',
4734 'KE': '105.48.0.0/12',
4735 'KG': '158.181.128.0/17',
4736 'KH': '36.37.128.0/17',
4737 'KI': '103.25.140.0/22',
4738 'KM': '197.255.224.0/20',
4739 'KN': '198.167.192.0/19',
4740 'KP': '175.45.176.0/22',
4741 'KR': '175.192.0.0/10',
4742 'KW': '37.36.0.0/14',
4743 'KY': '64.96.0.0/15',
4744 'KZ': '2.72.0.0/13',
4745 'LA': '115.84.64.0/18',
4746 'LB': '178.135.0.0/16',
4747 'LC': '24.92.144.0/20',
4748 'LI': '82.117.0.0/19',
4749 'LK': '112.134.0.0/15',
4750 'LR': '102.183.0.0/16',
4751 'LS': '129.232.0.0/17',
4752 'LT': '78.56.0.0/13',
4753 'LU': '188.42.0.0/16',
4754 'LV': '46.109.0.0/16',
4755 'LY': '41.252.0.0/14',
4756 'MA': '105.128.0.0/11',
4757 'MC': '88.209.64.0/18',
4758 'MD': '37.246.0.0/16',
4759 'ME': '178.175.0.0/17',
4760 'MF': '74.112.232.0/21',
4761 'MG': '154.126.0.0/17',
4762 'MH': '117.103.88.0/21',
4763 'MK': '77.28.0.0/15',
4764 'ML': '154.118.128.0/18',
4765 'MM': '37.111.0.0/17',
4766 'MN': '49.0.128.0/17',
4767 'MO': '60.246.0.0/16',
4768 'MP': '202.88.64.0/20',
4769 'MQ': '109.203.224.0/19',
4770 'MR': '41.188.64.0/18',
4771 'MS': '208.90.112.0/22',
4772 'MT': '46.11.0.0/16',
4773 'MU': '105.16.0.0/12',
4774 'MV': '27.114.128.0/18',
4775 'MW': '102.70.0.0/15',
4776 'MX': '187.192.0.0/11',
4777 'MY': '175.136.0.0/13',
4778 'MZ': '197.218.0.0/15',
4779 'NA': '41.182.0.0/16',
4780 'NC': '101.101.0.0/18',
4781 'NE': '197.214.0.0/18',
4782 'NF': '203.17.240.0/22',
4783 'NG': '105.112.0.0/12',
4784 'NI': '186.76.0.0/15',
4785 'NL': '145.96.0.0/11',
4786 'NO': '84.208.0.0/13',
4787 'NP': '36.252.0.0/15',
4788 'NR': '203.98.224.0/19',
4789 'NU': '49.156.48.0/22',
4790 'NZ': '49.224.0.0/14',
4791 'OM': '5.36.0.0/15',
4792 'PA': '186.72.0.0/15',
4793 'PE': '186.160.0.0/14',
4794 'PF': '123.50.64.0/18',
4795 'PG': '124.240.192.0/19',
4796 'PH': '49.144.0.0/13',
4797 'PK': '39.32.0.0/11',
4798 'PL': '83.0.0.0/11',
4799 'PM': '70.36.0.0/20',
4800 'PR': '66.50.0.0/16',
4801 'PS': '188.161.0.0/16',
4802 'PT': '85.240.0.0/13',
4803 'PW': '202.124.224.0/20',
4804 'PY': '181.120.0.0/14',
4805 'QA': '37.210.0.0/15',
4806 'RE': '102.35.0.0/16',
4807 'RO': '79.112.0.0/13',
4808 'RS': '93.86.0.0/15',
4809 'RU': '5.136.0.0/13',
4810 'RW': '41.186.0.0/16',
4811 'SA': '188.48.0.0/13',
4812 'SB': '202.1.160.0/19',
4813 'SC': '154.192.0.0/11',
4814 'SD': '102.120.0.0/13',
4815 'SE': '78.64.0.0/12',
4816 'SG': '8.128.0.0/10',
4817 'SI': '188.196.0.0/14',
4818 'SK': '78.98.0.0/15',
4819 'SL': '102.143.0.0/17',
4820 'SM': '89.186.32.0/19',
4821 'SN': '41.82.0.0/15',
4822 'SO': '154.115.192.0/18',
4823 'SR': '186.179.128.0/17',
4824 'SS': '105.235.208.0/21',
4825 'ST': '197.159.160.0/19',
4826 'SV': '168.243.0.0/16',
4827 'SX': '190.102.0.0/20',
4828 'SY': '5.0.0.0/16',
4829 'SZ': '41.84.224.0/19',
4830 'TC': '65.255.48.0/20',
4831 'TD': '154.68.128.0/19',
4832 'TG': '196.168.0.0/14',
4833 'TH': '171.96.0.0/13',
4834 'TJ': '85.9.128.0/18',
4835 'TK': '27.96.24.0/21',
4836 'TL': '180.189.160.0/20',
4837 'TM': '95.85.96.0/19',
4838 'TN': '197.0.0.0/11',
4839 'TO': '175.176.144.0/21',
4840 'TR': '78.160.0.0/11',
4841 'TT': '186.44.0.0/15',
4842 'TV': '202.2.96.0/19',
4843 'TW': '120.96.0.0/11',
4844 'TZ': '156.156.0.0/14',
4845 'UA': '37.52.0.0/14',
4846 'UG': '102.80.0.0/13',
4847 'US': '6.0.0.0/8',
4848 'UY': '167.56.0.0/13',
4849 'UZ': '84.54.64.0/18',
4850 'VA': '212.77.0.0/19',
4851 'VC': '207.191.240.0/21',
4852 'VE': '186.88.0.0/13',
4853 'VG': '66.81.192.0/20',
4854 'VI': '146.226.0.0/16',
4855 'VN': '14.160.0.0/11',
4856 'VU': '202.80.32.0/20',
4857 'WF': '117.20.32.0/21',
4858 'WS': '202.4.32.0/19',
4859 'YE': '134.35.0.0/16',
4860 'YT': '41.242.116.0/22',
4861 'ZA': '41.0.0.0/11',
4862 'ZM': '102.144.0.0/13',
4863 'ZW': '102.177.192.0/18',
4864 }
4865
4866 @classmethod
4867 def random_ipv4(cls, code_or_block):
4868 if len(code_or_block) == 2:
4869 block = cls._country_ip_map.get(code_or_block.upper())
4870 if not block:
4871 return None
4872 else:
4873 block = code_or_block
4874 addr, preflen = block.split('/')
4875 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4876 addr_max = addr_min | (0xffffffff >> int(preflen))
4877 return str(socket.inet_ntoa(
4878 struct.pack('!L', random.randint(addr_min, addr_max))))
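
# Example (illustrative): GeoUtils.random_ipv4('DE') picks a random address inside
# 53.0.0.0/8, while GeoUtils.random_ipv4('10.0.0.0/8') samples the given block directly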
4879
4880
4881 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4882 def __init__(self, proxies=None):
4883 # Set default handlers
4884 for type in ('http', 'https'):
4885 setattr(self, '%s_open' % type,
4886 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4887 meth(r, proxy, type))
4888 urllib.request.ProxyHandler.__init__(self, proxies)
4889
4890 def proxy_open(self, req, proxy, type):
4891 req_proxy = req.headers.get('Ytdl-request-proxy')
4892 if req_proxy is not None:
4893 proxy = req_proxy
4894 del req.headers['Ytdl-request-proxy']
4895
4896 if proxy == '__noproxy__':
4897 return None # No Proxy
4898 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4899 req.add_header('Ytdl-socks-proxy', proxy)
4900 # yt-dlp's http/https handlers do the wrapping of the socket with socks
4901 return None
4902 return urllib.request.ProxyHandler.proxy_open(
4903 self, req, proxy, type)
4904
4905
4906 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4907 # released into the public domain
4908 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4909
4910 def long_to_bytes(n, blocksize=0):
4911 """long_to_bytes(n:long, blocksize:int) : string
4912 Convert a long integer to a byte string.
4913
4914 If optional blocksize is given and greater than zero, pad the front of the
4915 byte string with binary zeros so that the length is a multiple of
4916 blocksize.
4917 """
4918 # after much testing, this algorithm was deemed to be the fastest
4919 s = b''
4920 n = int(n)
4921 while n > 0:
4922 s = struct.pack('>I', n & 0xffffffff) + s
4923 n = n >> 32
4924 # strip off leading zeros
4925 for i in range(len(s)):
4926 if s[i] != b'\000'[0]:
4927 break
4928 else:
4929 # only happens when n == 0
4930 s = b'\000'
4931 i = 0
4932 s = s[i:]
4933 # add back some pad bytes. this could be done more efficiently w.r.t. the
4934 # de-padding being done above, but sigh...
4935 if blocksize > 0 and len(s) % blocksize:
4936 s = (blocksize - len(s) % blocksize) * b'\000' + s
4937 return s
4938
4939
4940 def bytes_to_long(s):
4941 """bytes_to_long(string) : long
4942 Convert a byte string to a long integer.
4943
4944 This is (essentially) the inverse of long_to_bytes().
4945 """
4946 acc = 0
4947 length = len(s)
4948 if length % 4:
4949 extra = (4 - length % 4)
4950 s = b'\000' * extra + s
4951 length = length + extra
4952 for i in range(0, length, 4):
4953 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4954 return acc
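
# Example (illustrative): the two functions are inverses, e.g.
# long_to_bytes(65537, blocksize=4) == b'\x00\x01\x00\x01'
# and bytes_to_long(b'\x00\x01\x00\x01') == 65537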
4955
4956
4957 def ohdave_rsa_encrypt(data, exponent, modulus):
4958 '''
4959 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4960
4961 Input:
4962 data: data to encrypt, bytes-like object
4963 exponent, modulus: parameters e and N of the RSA algorithm, both integers
4964 Output: hex string of encrypted data
4965
4966 Limitation: supports one block encryption only
4967 '''
4968
4969 payload = int(binascii.hexlify(data[::-1]), 16)
4970 encrypted = pow(payload, exponent, modulus)
4971 return '%x' % encrypted
4972
4973
4974 def pkcs1pad(data, length):
4975 """
4976 Padding input data with PKCS#1 scheme
4977
4978 @param {int[]} data input data
4979 @param {int} length target length
4980 @returns {int[]} padded data
4981 """
4982 if len(data) > length - 11:
4983 raise ValueError('Input data too long for PKCS#1 padding')
4984
4985 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 v1.5 padding octets must be non-zero
4986 return [0, 2] + pseudo_random + [0] + data
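
# Example (illustrative): len(pkcs1pad([1, 2, 3], 32)) == 32, with the layout
# [0, 2, *random padding, 0, 1, 2, 3]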
4987
4988
4989 def _base_n_table(n, table):
4990 if not table and not n:
4991 raise ValueError('Either table or n must be specified')
4992 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4993
4994 if n and n != len(table):
4995 raise ValueError(f'base {n} exceeds table length {len(table)}')
4996 return table
4997
4998
4999 def encode_base_n(num, n=None, table=None):
5000 """Convert given int to a base-n string"""
5001 table = _base_n_table(n, table)
5002 if not num:
5003 return table[0]
5004
5005 result, base = '', len(table)
5006 while num:
5007 result = table[num % base] + result
5008 num = num // base
5009 return result
5010
5011
5012 def decode_base_n(string, n=None, table=None):
5013 """Convert given base-n string to int"""
5014 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
5015 result, base = 0, len(table)
5016 for char in string:
5017 result = result * base + table[char]
5018 return result
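
# Example (illustrative): encode_base_n(255, 16) == 'ff' and decode_base_n('ff', 16) == 255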
5019
5020
5021 def decode_packed_codes(code):
5022 mobj = re.search(PACKED_CODES_RE, code)
5023 obfuscated_code, base, count, symbols = mobj.groups()
5024 base = int(base)
5025 count = int(count)
5026 symbols = symbols.split('|')
5027 symbol_table = {}
5028
5029 while count:
5030 count -= 1
5031 base_n_count = encode_base_n(count, base)
5032 symbol_table[base_n_count] = symbols[count] or base_n_count
5033
5034 return re.sub(
5035 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
5036 obfuscated_code)
5037
5038
5039 def caesar(s, alphabet, shift):
5040 if shift == 0:
5041 return s
5042 l = len(alphabet)
5043 return ''.join(
5044 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5045 for c in s)
5046
5047
5048 def rot47(s):
5049 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
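
# Example (illustrative): rot47 is its own inverse over the 94 printable ASCII
# characters, so rot47(rot47(s)) == s; e.g. rot47('Hello') == 'w6==@'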
5050
5051
5052 def parse_m3u8_attributes(attrib):
5053 info = {}
5054 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5055 if val.startswith('"'):
5056 val = val[1:-1]
5057 info[key] = val
5058 return info
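
# Example (illustrative):
# parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1,mp4a"')
# == {'BANDWIDTH': '1280000', 'CODECS': 'avc1,mp4a'}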
5059
5060
5061 def urshift(val, n):
5062 return val >> n if val >= 0 else (val + 0x100000000) >> n
5063
5064
5065 def write_xattr(path, key, value):
5066 # Windows: Write xattrs to NTFS Alternate Data Streams:
5067 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5068 if compat_os_name == 'nt':
5069 assert ':' not in key
5070 assert os.path.exists(path)
5071
5072 try:
5073 with open(f'{path}:{key}', 'wb') as f:
5074 f.write(value)
5075 except OSError as e:
5076 raise XAttrMetadataError(e.errno, e.strerror)
5077 return
5078
5079 # UNIX Method 1. Use xattrs/pyxattrs modules
5080
5081 setxattr = None
5082 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5083 # Unicode arguments are not supported in pyxattr until version 0.5.0
5084 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5085 if version_tuple(xattr.__version__) >= (0, 5, 0):
5086 setxattr = xattr.set
5087 elif xattr:
5088 setxattr = xattr.setxattr
5089
5090 if setxattr:
5091 try:
5092 setxattr(path, key, value)
5093 except OSError as e:
5094 raise XAttrMetadataError(e.errno, e.strerror)
5095 return
5096
5097 # UNIX Method 2. Use setfattr/xattr executables
5098 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5099 else 'xattr' if check_executable('xattr', ['-h']) else None)
5100 if not exe:
5101 raise XAttrUnavailableError(
5102 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5103 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5104
5105 value = value.decode()
5106 try:
5107 _, stderr, returncode = Popen.run(
5108 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5109 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5110 except OSError as e:
5111 raise XAttrMetadataError(e.errno, e.strerror)
5112 if returncode:
5113 raise XAttrMetadataError(returncode, stderr)
5114
5115
5116 def random_birthday(year_field, month_field, day_field):
5117 start_date = datetime.date(1950, 1, 1)
5118 end_date = datetime.date(1995, 12, 31)
5119 offset = random.randint(0, (end_date - start_date).days)
5120 random_date = start_date + datetime.timedelta(offset)
5121 return {
5122 year_field: str(random_date.year),
5123 month_field: str(random_date.month),
5124 day_field: str(random_date.day),
5125 }
5126
5127
5128 def find_available_port(interface=''):
5129 try:
5130 with socket.socket() as sock:
5131 sock.bind((interface, 0))
5132 return sock.getsockname()[1]
5133 except OSError:
5134 return None
5135
5136
5137 # Templates for internet shortcut files, which are plain text files.
5138 DOT_URL_LINK_TEMPLATE = '''\
5139 [InternetShortcut]
5140 URL=%(url)s
5141 '''
5142
5143 DOT_WEBLOC_LINK_TEMPLATE = '''\
5144 <?xml version="1.0" encoding="UTF-8"?>
5145 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5146 <plist version="1.0">
5147 <dict>
5148 \t<key>URL</key>
5149 \t<string>%(url)s</string>
5150 </dict>
5151 </plist>
5152 '''
5153
5154 DOT_DESKTOP_LINK_TEMPLATE = '''\
5155 [Desktop Entry]
5156 Encoding=UTF-8
5157 Name=%(filename)s
5158 Type=Link
5159 URL=%(url)s
5160 Icon=text-html
5161 '''
5162
5163 LINK_TEMPLATES = {
5164 'url': DOT_URL_LINK_TEMPLATE,
5165 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5166 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5167 }
5168
5169
5170 def iri_to_uri(iri):
5171 """
5172 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5173
5174 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5175 """
5176
5177 iri_parts = urllib.parse.urlparse(iri)
5178
5179 if '[' in iri_parts.netloc:
5180 raise ValueError('IPv6 URIs are not yet supported.')
5181 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5182
5183 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5184
5185 net_location = ''
5186 if iri_parts.username:
5187 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5188 if iri_parts.password is not None:
5189 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5190 net_location += '@'
5191
5192 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5193 # The 'idna' encoding produces ASCII text.
5194 if iri_parts.port is not None and iri_parts.port != 80:
5195 net_location += ':' + str(iri_parts.port)
5196
5197 return urllib.parse.urlunparse(
5198 (iri_parts.scheme,
5199 net_location,
5200
5201 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5202
5203 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5204 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5205
5206 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5207 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5208
5209 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5210
5211 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
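
# Example (illustrative): iri_to_uri('http://example.com/ä?q=1') should yield
# 'http://example.com/%C3%A4?q=1' (non-ASCII path characters are UTF-8 percent-encoded)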
5212
5213
5214 def to_high_limit_path(path):
5215 if sys.platform in ['win32', 'cygwin']:
5216 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5217 return '\\\\?\\' + os.path.abspath(path)
5218
5219 return path
5220
5221
5222 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5223 val = traversal.traverse_obj(obj, *variadic(field))
5224 if not val if ignore is NO_DEFAULT else val in variadic(ignore):
5225 return default
5226 return template % func(val)
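
# Examples (illustrative): format_field({'width': 1280}, 'width', '%dpx') == '1280px',
# while format_field({}, 'width', '%dpx', default='unknown') falls back to 'unknown'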
5227
5228
5229 def clean_podcast_url(url):
5230 return re.sub(r'''(?x)
5231 (?:
5232 (?:
5233 chtbl\.com/track|
5234 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5235 play\.podtrac\.com
5236 )/[^/]+|
5237 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5238 flex\.acast\.com|
5239 pd(?:
5240 cn\.co| # https://podcorn.com/analytics-prefix/
5241 st\.fm # https://podsights.com/docs/
5242 )/e
5243 )/''', '', url)
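
# Example (illustrative): tracking prefixes are stripped, e.g.
# clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/traffic.example.com/ep.mp3')
# == 'https://traffic.example.com/ep.mp3'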
5244
5245
5246 _HEX_TABLE = '0123456789abcdef'
5247
5248
5249 def random_uuidv4():
5250 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5251
5252
5253 def make_dir(path, to_screen=None):
5254 try:
5255 dn = os.path.dirname(path)
5256 if dn:
5257 os.makedirs(dn, exist_ok=True)
5258 return True
5259 except OSError as err:
5260 if callable(to_screen):
5261 to_screen(f'unable to create directory {err}')
5262 return False
5263
5264
5265 def get_executable_path():
5266 from ..update import _get_variant_and_executable_path
5267
5268 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5269
5270
5271 def get_user_config_dirs(package_name):
5272 # .config (e.g. ~/.config/package_name)
5273 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
5274 yield os.path.join(xdg_config_home, package_name)
5275
5276 # appdata (%APPDATA%/package_name)
5277 appdata_dir = os.getenv('appdata')
5278 if appdata_dir:
5279 yield os.path.join(appdata_dir, package_name)
5280
5281 # home (~/.package_name)
5282 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
5283
5284
5285 def get_system_config_dirs(package_name):
5286 # /etc/package_name
5287 yield os.path.join('/etc', package_name)
5288
5289
5290 def time_seconds(**kwargs):
5291 """
5292 Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
5293 """
5294 return time.time() + datetime.timedelta(**kwargs).total_seconds()
5295
5296
5297 # create a JSON Web Signature (JWS) with the HS256 algorithm
5298 # the result is in JWS Compact Serialization format
5299 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5300 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5301 def jwt_encode_hs256(payload_data, key, headers={}):
5302 header_data = {
5303 'alg': 'HS256',
5304 'typ': 'JWT',
5305 }
5306 if headers:
5307 header_data.update(headers)
5308 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5309 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5310 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5311 signature_b64 = base64.b64encode(h.digest())
5312 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5313 return token
5314
5315
5316 # can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5317 def jwt_decode_hs256(jwt):
5318 header_b64, payload_b64, signature_b64 = jwt.split('.')
5319 # add trailing ='s that may have been stripped, superfluous ='s are ignored
5320 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
5321 return payload_data
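
# Example (illustrative) round trip:
# token = jwt_encode_hs256({'iss': 'example'}, 'secret-key')
# jwt_decode_hs256(token.decode()) == {'iss': 'example'}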
5322
5323
5324 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5325
5326
5327 @functools.cache
5328 def supports_terminal_sequences(stream):
5329 if compat_os_name == 'nt':
5330 if not WINDOWS_VT_MODE:
5331 return False
5332 elif not os.getenv('TERM'):
5333 return False
5334 try:
5335 return stream.isatty()
5336 except BaseException:
5337 return False
5338
5339
5340 def windows_enable_vt_mode():
5341 """Ref: https://bugs.python.org/issue30075 """
5342 if get_windows_version() < (10, 0, 10586):
5343 return
5344
5345 import ctypes
5346 import ctypes.wintypes
5347 import msvcrt
5348
5349 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5350
5351 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5352 handle = os.open('CONOUT$', os.O_RDWR)
5353 try:
5354 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5355 dw_original_mode = ctypes.wintypes.DWORD()
5356 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5357 if not success:
5358 raise Exception('GetConsoleMode failed')
5359
5360 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5361 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5362 if not success:
5363 raise Exception('SetConsoleMode failed')
5364 finally:
5365 os.close(handle)
5366
5367 global WINDOWS_VT_MODE
5368 WINDOWS_VT_MODE = True
5369 supports_terminal_sequences.cache_clear()
5370
5371
5372 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5373
5374
5375 def remove_terminal_sequences(string):
5376 return _terminal_sequences_re.sub('', string)
5377
5378
5379 def number_of_digits(number):
5380 return len('%d' % number)
5381
5382
5383 def join_nonempty(*values, delim='-', from_dict=None):
5384 if from_dict is not None:
5385 values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
5386 return delim.join(map(str, filter(None, values)))
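
# Examples (illustrative): falsy values are dropped, so join_nonempty('a', None, 'b', 0) == 'a-b'
# and join_nonempty(1920, 1080, delim='x') == '1920x1080'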
5387
5388
5389 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5390 """
5391 Find the largest format dimensions in terms of video width and, for each thumbnail:
5392 * Modify the URL: Match the width with the provided regex and replace with the former width
5393 * Update dimensions
5394
5395 This function is useful with video services that scale the provided thumbnails on demand
5396 """
5397 _keys = ('width', 'height')
5398 max_dimensions = max(
5399 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5400 default=(0, 0))
5401 if not max_dimensions[0]:
5402 return thumbnails
5403 return [
5404 merge_dicts(
5405 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5406 dict(zip(_keys, max_dimensions)), thumbnail)
5407 for thumbnail in thumbnails
5408 ]
5409
5410
5411 def parse_http_range(range):
5412 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5413 if not range:
5414 return None, None, None
5415 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5416 if not crg:
5417 return None, None, None
5418 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
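# e.g. parse_http_range('bytes=0-499') == (0, 499, None)        # "Range" header
#      parse_http_range('bytes 0-499/1234') == (0, 499, 1234)   # "Content-Range" header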
5419
5420
5421 def read_stdin(what):
5422 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5423 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5424 return sys.stdin
5425
5426
5427 def determine_file_encoding(data):
5428 """
5429 Detect the text encoding used
5430 @returns (encoding, bytes to skip)
5431 """
5432
    # BOMs are given priority over coding declarations
5434 for bom, enc in BOMS:
5435 if data.startswith(bom):
5436 return enc, len(bom)
5437
5438 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5439 # We ignore the endianness to get a good enough match
5440 data = data.replace(b'\0', b'')
5441 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5442 return mobj.group(1).decode() if mobj else None, 0
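# e.g. determine_file_encoding(b'\xef\xbb\xbf--flag') == ('utf-8', 3)   # UTF-8 BOM
#      determine_file_encoding(b'# coding: cp932\n--flag') == ('cp932', 0)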
5443
5444
5445 class Config:
5446 own_args = None
5447 parsed_args = None
5448 filename = None
5449 __initialized = False
5450
5451 def __init__(self, parser, label=None):
5452 self.parser, self.label = parser, label
5453 self._loaded_paths, self.configs = set(), []
5454
5455 def init(self, args=None, filename=None):
5456 assert not self.__initialized
5457 self.own_args, self.filename = args, filename
5458 return self.load_configs()
5459
5460 def load_configs(self):
5461 directory = ''
5462 if self.filename:
5463 location = os.path.realpath(self.filename)
5464 directory = os.path.dirname(location)
5465 if location in self._loaded_paths:
5466 return False
5467 self._loaded_paths.add(location)
5468
5469 self.__initialized = True
5470 opts, _ = self.parser.parse_known_args(self.own_args)
5471 self.parsed_args = self.own_args
5472 for location in opts.config_locations or []:
5473 if location == '-':
5474 if location in self._loaded_paths:
5475 continue
5476 self._loaded_paths.add(location)
5477 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5478 continue
5479 location = os.path.join(directory, expand_path(location))
5480 if os.path.isdir(location):
5481 location = os.path.join(location, 'yt-dlp.conf')
5482 if not os.path.exists(location):
5483 self.parser.error(f'config location {location} does not exist')
5484 self.append_config(self.read_file(location), location)
5485 return True
5486
5487 def __str__(self):
5488 label = join_nonempty(
5489 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5490 delim=' ')
5491 return join_nonempty(
5492 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5493 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5494 delim='\n')
5495
5496 @staticmethod
5497 def read_file(filename, default=[]):
5498 try:
5499 optionf = open(filename, 'rb')
5500 except OSError:
5501 return default # silently skip if file is not present
5502 try:
5503 enc, skip = determine_file_encoding(optionf.read(512))
5504 optionf.seek(skip, io.SEEK_SET)
5505 except OSError:
5506 enc = None # silently skip read errors
5507 try:
5508 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5509 contents = optionf.read().decode(enc or preferredencoding())
5510 res = shlex.split(contents, comments=True)
5511 except Exception as err:
5512 raise ValueError(f'Unable to parse "{filename}": {err}')
5513 finally:
5514 optionf.close()
5515 return res
5516
5517 @staticmethod
5518 def hide_login_info(opts):
5519 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5520 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5521
5522 def _scrub_eq(o):
5523 m = eqre.match(o)
5524 if m:
5525 return m.group('key') + '=PRIVATE'
5526 else:
5527 return o
5528
5529 opts = list(map(_scrub_eq, opts))
5530 for idx, opt in enumerate(opts):
5531 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5532 opts[idx + 1] = 'PRIVATE'
5533 return opts
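    # e.g. hide_login_info(['-u', 'name', '--password=secret', '-v'])
    #      == ['-u', 'PRIVATE', '--password=PRIVATE', '-v']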
5534
5535 def append_config(self, *args, label=None):
5536 config = type(self)(self.parser, label)
5537 config._loaded_paths = self._loaded_paths
5538 if config.init(*args):
5539 self.configs.append(config)
5540
5541 @property
5542 def all_args(self):
5543 for config in reversed(self.configs):
5544 yield from config.all_args
5545 yield from self.parsed_args or []
5546
5547 def parse_known_args(self, **kwargs):
5548 return self.parser.parse_known_args(self.all_args, **kwargs)
5549
5550 def parse_args(self):
5551 return self.parser.parse_args(self.all_args)
5552
5553
5554 class WebSocketsWrapper:
5555 """Wraps websockets module to use in non-async scopes"""
5556 pool = None
5557
5558 def __init__(self, url, headers=None, connect=True):
5559 self.loop = asyncio.new_event_loop()
5560 # XXX: "loop" is deprecated
5561 self.conn = websockets.connect(
5562 url, extra_headers=headers, ping_interval=None,
5563 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5564 if connect:
5565 self.__enter__()
5566 atexit.register(self.__exit__, None, None, None)
5567
5568 def __enter__(self):
5569 if not self.pool:
5570 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5571 return self
5572
5573 def send(self, *args):
5574 self.run_with_loop(self.pool.send(*args), self.loop)
5575
5576 def recv(self, *args):
5577 return self.run_with_loop(self.pool.recv(*args), self.loop)
5578
5579 def __exit__(self, type, value, traceback):
5580 try:
5581 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5582 finally:
            # cancel leftover tasks before closing; run_until_complete would raise on a closed loop
            self._cancel_all_tasks(self.loop)
            self.loop.close()
5585
5586 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: if any new library that uses asyncio needs to run in a non-async scope, move these functions out of this class
5588 @staticmethod
5589 def run_with_loop(main, loop):
5590 if not asyncio.iscoroutine(main):
5591 raise ValueError(f'a coroutine was expected, got {main!r}')
5592
5593 try:
5594 return loop.run_until_complete(main)
5595 finally:
5596 loop.run_until_complete(loop.shutdown_asyncgens())
5597 if hasattr(loop, 'shutdown_default_executor'):
5598 loop.run_until_complete(loop.shutdown_default_executor())
5599
5600 @staticmethod
5601 def _cancel_all_tasks(loop):
5602 to_cancel = asyncio.all_tasks(loop)
5603
5604 if not to_cancel:
5605 return
5606
5607 for task in to_cancel:
5608 task.cancel()
5609
        # XXX: the `loop` parameter was removed in Python 3.10+
5611 loop.run_until_complete(
5612 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5613
5614 for task in to_cancel:
5615 if task.cancelled():
5616 continue
5617 if task.exception() is not None:
5618 loop.call_exception_handler({
5619 'message': 'unhandled exception during asyncio.run() shutdown',
5620 'exception': task.exception(),
5621 'task': task,
5622 })
5623
5624
5625 def merge_headers(*dicts):
5626 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5627 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
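# e.g. merge_headers({'user-agent': 'UA1', 'Accept': '*/*'}, {'User-Agent': 'UA2'})
#      == {'User-Agent': 'UA2', 'Accept': '*/*'}   # keys are title-cased, later dicts win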
5628
5629
5630 def cached_method(f):
5631 """Cache a method"""
5632 signature = inspect.signature(f)
5633
5634 @functools.wraps(f)
5635 def wrapper(self, *args, **kwargs):
5636 bound_args = signature.bind(self, *args, **kwargs)
5637 bound_args.apply_defaults()
5638 key = tuple(bound_args.arguments.values())[1:]
5639
5640 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
5641 if key not in cache:
5642 cache[key] = f(self, *args, **kwargs)
5643 return cache[key]
5644 return wrapper
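# Rough usage sketch (hypothetical class; the cache is stored per instance and
# keyed on the bound arguments, so unhashable arguments are not supported):
#   class Client:
#       @cached_method
#       def fetch(self, url):
#           ...  # expensive work, executed once per distinct `url`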
5645
5646
5647 class classproperty:
5648 """property access for class methods with optional caching"""
5649 def __new__(cls, func=None, *args, **kwargs):
5650 if not func:
5651 return functools.partial(cls, *args, **kwargs)
5652 return super().__new__(cls)
5653
5654 def __init__(self, func, *, cache=False):
5655 functools.update_wrapper(self, func)
5656 self.func = func
5657 self._cache = {} if cache else None
5658
5659 def __get__(self, _, cls):
5660 if self._cache is None:
5661 return self.func(cls)
5662 elif cls not in self._cache:
5663 self._cache[cls] = self.func(cls)
5664 return self._cache[cls]
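# Rough usage sketch (hypothetical class; compute() is a placeholder):
#   class Foo:
#       @classproperty(cache=True)
#       def expensive(cls):
#           return compute()  # evaluated at most once per class
#   Foo.expensive  # accessed like an attribute, without ()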
5665
5666
5667 class function_with_repr:
5668 def __init__(self, func, repr_=None):
5669 functools.update_wrapper(self, func)
5670 self.func, self.__repr = func, repr_
5671
5672 def __call__(self, *args, **kwargs):
5673 return self.func(*args, **kwargs)
5674
5675 def __repr__(self):
5676 if self.__repr:
5677 return self.__repr
5678 return f'{self.func.__module__}.{self.func.__qualname__}'
5679
5680
5681 class Namespace(types.SimpleNamespace):
5682 """Immutable namespace"""
5683
5684 def __iter__(self):
5685 return iter(self.__dict__.values())
5686
5687 @property
5688 def items_(self):
5689 return self.__dict__.items()
5690
5691
5692 MEDIA_EXTENSIONS = Namespace(
5693 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5694 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5695 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5696 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
5697 thumbnails=('jpg', 'png', 'webp'),
5698 storyboards=('mhtml', ),
5699 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5700 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5701 )
5702 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5703 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5704
5705 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5706
5707
5708 class RetryManager:
5709 """Usage:
5710 for retry in RetryManager(...):
5711 try:
5712 ...
5713 except SomeException as err:
5714 retry.error = err
5715 continue
5716 """
5717 attempt, _error = 0, None
5718
5719 def __init__(self, _retries, _error_callback, **kwargs):
5720 self.retries = _retries or 0
5721 self.error_callback = functools.partial(_error_callback, **kwargs)
5722
5723 def _should_retry(self):
5724 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5725
5726 @property
5727 def error(self):
5728 if self._error is NO_DEFAULT:
5729 return None
5730 return self._error
5731
5732 @error.setter
5733 def error(self, value):
5734 self._error = value
5735
5736 def __iter__(self):
5737 while self._should_retry():
5738 self.error = NO_DEFAULT
5739 self.attempt += 1
5740 yield self
5741 if self.error:
5742 self.error_callback(self.error, self.attempt, self.retries)
5743
5744 @staticmethod
5745 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5746 """Utility function for reporting retries"""
5747 if count > retries:
5748 if error:
5749 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5750 raise e
5751
5752 if not count:
5753 return warn(e)
5754 elif isinstance(e, ExtractorError):
5755 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5756 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5757
5758 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5759 if delay:
5760 info(f'Sleeping {delay:.2f} seconds ...')
5761 time.sleep(delay)
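# Rough sketch of wiring RetryManager to report_retry (fragile_operation and the
# print callbacks are placeholders):
#   for retry in RetryManager(3, RetryManager.report_retry,
#                             sleep_func=1, info=print, warn=print):
#       try:
#           fragile_operation()
#       except OSError as err:
#           retry.error = err
#           continue
#   # 1 + 3 attempts in total; the last error is re-raised since error= is not passed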
5762
5763
5764 def make_archive_id(ie, video_id):
5765 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5766 return f'{ie_key.lower()} {video_id}'
5767
5768
5769 def truncate_string(s, left, right=0):
5770 assert left > 3 and right >= 0
5771 if s is None or len(s) <= left + right:
5772 return s
    return f'{s[:left - 3]}...{s[-right:] if right else ""}'
5774
5775
5776 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5777 assert 'all' in alias_dict, '"all" alias is required'
5778 requested = list(start or [])
5779 for val in options:
5780 discard = val.startswith('-')
5781 if discard:
5782 val = val[1:]
5783
5784 if val in alias_dict:
5785 val = alias_dict[val] if not discard else [
5786 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5787 # NB: Do not allow regex in aliases for performance
5788 requested = orderedSet_from_options(val, alias_dict, start=requested)
5789 continue
5790
5791 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5792 else [val] if val in alias_dict['all'] else None)
5793 if current is None:
5794 raise ValueError(val)
5795
5796 if discard:
5797 for item in current:
5798 while item in requested:
5799 requested.remove(item)
5800 else:
5801 requested.extend(current)
5802
5803 return orderedSet(requested)
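# e.g. with alias_dict = {'all': ['a', 'b', 'c'], 'default': ['a', 'b']}:
#   orderedSet_from_options(['default', '-b', 'c'], alias_dict) == ['a', 'c']
# ('-x' discards x; aliases expand recursively, and 'all' doubles as the allow-list)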
5804
5805
5806 class FormatSorter:
5807 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
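    # Each component of a -S/--format-sort string is matched against this, e.g.:
    #   'res:1080'    -> field='res', separator=':', limit='1080' (prefer the largest value up to the limit)
    #   'filesize~1G' -> separator='~' prefers the value closest to the limit
    #   '+size'       -> reverse=True sorts that field ascending (prefer smaller)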
5808
5809 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5810 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5811 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
5812 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5813 'height', 'width', 'proto', 'vext', 'abr', 'aext',
5814 'fps', 'fs_approx', 'source', 'id')
5815
5816 settings = {
5817 'vcodec': {'type': 'ordered', 'regex': True,
5818 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5819 'acodec': {'type': 'ordered', 'regex': True,
5820 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
5821 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5822 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5823 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5824 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5825 'vext': {'type': 'ordered', 'field': 'video_ext',
5826 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5827 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
5828 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5829 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5830 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
5831 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5832 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5833 'field': ('vcodec', 'acodec'),
5834 'function': lambda it: int(any(v != 'none' for v in it))},
5835 'ie_pref': {'priority': True, 'type': 'extractor'},
5836 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5837 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5838 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5839 'quality': {'convert': 'float', 'default': -1},
5840 'filesize': {'convert': 'bytes'},
5841 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5842 'id': {'convert': 'string', 'field': 'format_id'},
5843 'height': {'convert': 'float_none'},
5844 'width': {'convert': 'float_none'},
5845 'fps': {'convert': 'float_none'},
5846 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5847 'tbr': {'convert': 'float_none'},
5848 'vbr': {'convert': 'float_none'},
5849 'abr': {'convert': 'float_none'},
5850 'asr': {'convert': 'float_none'},
5851 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5852
5853 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5854 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
5855 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
5856 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5857 'res': {'type': 'multiple', 'field': ('height', 'width'),
5858 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5859
5860 # Actual field names
5861 'format_id': {'type': 'alias', 'field': 'id'},
5862 'preference': {'type': 'alias', 'field': 'ie_pref'},
5863 'language_preference': {'type': 'alias', 'field': 'lang'},
5864 'source_preference': {'type': 'alias', 'field': 'source'},
5865 'protocol': {'type': 'alias', 'field': 'proto'},
5866 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5867 'audio_channels': {'type': 'alias', 'field': 'channels'},
5868
5869 # Deprecated
5870 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5871 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5872 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5873 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5874 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5875 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5876 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5877 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5878 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5879 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5880 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5881 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5882 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5883 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5884 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5885 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5886 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5887 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5888 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5889 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5890 }
5891
5892 def __init__(self, ydl, field_preference):
5893 self.ydl = ydl
5894 self._order = []
5895 self.evaluate_params(self.ydl.params, field_preference)
5896 if ydl.params.get('verbose'):
5897 self.print_verbose_info(self.ydl.write_debug)
5898
5899 def _get_field_setting(self, field, key):
5900 if field not in self.settings:
5901 if key in ('forced', 'priority'):
5902 return False
5903 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5904 'deprecated and may be removed in a future version')
5905 self.settings[field] = {}
5906 propObj = self.settings[field]
5907 if key not in propObj:
5908 type = propObj.get('type')
5909 if key == 'field':
5910 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5911 elif key == 'convert':
5912 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5913 else:
5914 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5915 propObj[key] = default
5916 return propObj[key]
5917
5918 def _resolve_field_value(self, field, value, convertNone=False):
5919 if value is None:
5920 if not convertNone:
5921 return None
5922 else:
5923 value = value.lower()
5924 conversion = self._get_field_setting(field, 'convert')
5925 if conversion == 'ignore':
5926 return None
5927 if conversion == 'string':
5928 return value
5929 elif conversion == 'float_none':
5930 return float_or_none(value)
5931 elif conversion == 'bytes':
5932 return parse_bytes(value)
5933 elif conversion == 'order':
5934 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5935 use_regex = self._get_field_setting(field, 'regex')
5936 list_length = len(order_list)
5937 empty_pos = order_list.index('') if '' in order_list else list_length + 1
5938 if use_regex and value is not None:
5939 for i, regex in enumerate(order_list):
5940 if regex and re.match(regex, value):
5941 return list_length - i
5942 return list_length - empty_pos # not in list
5943 else: # not regex or value = None
5944 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5945 else:
5946 if value.isnumeric():
5947 return float(value)
5948 else:
5949 self.settings[field]['convert'] = 'string'
5950 return value
5951
5952 def evaluate_params(self, params, sort_extractor):
5953 self._use_free_order = params.get('prefer_free_formats', False)
5954 self._sort_user = params.get('format_sort', [])
5955 self._sort_extractor = sort_extractor
5956
5957 def add_item(field, reverse, closest, limit_text):
5958 field = field.lower()
5959 if field in self._order:
5960 return
5961 self._order.append(field)
5962 limit = self._resolve_field_value(field, limit_text)
5963 data = {
5964 'reverse': reverse,
5965 'closest': False if limit is None else closest,
5966 'limit_text': limit_text,
5967 'limit': limit}
5968 if field in self.settings:
5969 self.settings[field].update(data)
5970 else:
5971 self.settings[field] = data
5972
5973 sort_list = (
5974 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5975 + (tuple() if params.get('format_sort_force', False)
5976 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5977 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5978
5979 for item in sort_list:
5980 match = re.match(self.regex, item)
5981 if match is None:
5982 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5983 field = match.group('field')
5984 if field is None:
5985 continue
5986 if self._get_field_setting(field, 'type') == 'alias':
5987 alias, field = field, self._get_field_setting(field, 'field')
5988 if self._get_field_setting(alias, 'deprecated'):
5989 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5990 f'be removed in a future version. Please use {field} instead')
5991 reverse = match.group('reverse') is not None
5992 closest = match.group('separator') == '~'
5993 limit_text = match.group('limit')
5994
5995 has_limit = limit_text is not None
5996 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5997 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5998
5999 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6000 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6001 limit_count = len(limits)
6002 for (i, f) in enumerate(fields):
6003 add_item(f, reverse, closest,
6004 limits[i] if i < limit_count
6005 else limits[0] if has_limit and not has_multiple_limits
6006 else None)
6007
6008 def print_verbose_info(self, write_debug):
6009 if self._sort_user:
6010 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6011 if self._sort_extractor:
6012 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6013 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6014 '+' if self._get_field_setting(field, 'reverse') else '', field,
6015 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6016 self._get_field_setting(field, 'limit_text'),
6017 self._get_field_setting(field, 'limit'))
6018 if self._get_field_setting(field, 'limit_text') is not None else '')
6019 for field in self._order if self._get_field_setting(field, 'visible')]))
6020
6021 def _calculate_field_preference_from_value(self, format, field, type, value):
6022 reverse = self._get_field_setting(field, 'reverse')
6023 closest = self._get_field_setting(field, 'closest')
6024 limit = self._get_field_setting(field, 'limit')
6025
6026 if type == 'extractor':
6027 maximum = self._get_field_setting(field, 'max')
6028 if value is None or (maximum is not None and value >= maximum):
6029 value = -1
6030 elif type == 'boolean':
6031 in_list = self._get_field_setting(field, 'in_list')
6032 not_in_list = self._get_field_setting(field, 'not_in_list')
6033 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6034 elif type == 'ordered':
6035 value = self._resolve_field_value(field, value, True)
6036
6037 # try to convert to number
6038 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6039 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6040 if is_num:
6041 value = val_num
6042
6043 return ((-10, 0) if value is None
6044 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6045 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6046 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6047 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6048 else (-1, value, 0))
6049
6050 def _calculate_field_preference(self, format, field):
6051 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6052 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6053 if type == 'multiple':
6054 type = 'field' # Only 'field' is allowed in multiple for now
6055 actual_fields = self._get_field_setting(field, 'field')
6056
6057 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6058 else:
6059 value = get_value(field)
6060 return self._calculate_field_preference_from_value(format, field, type, value)
6061
6062 def calculate_preference(self, format):
6063 # Determine missing protocol
6064 if not format.get('protocol'):
6065 format['protocol'] = determine_protocol(format)
6066
6067 # Determine missing ext
6068 if not format.get('ext') and 'url' in format:
6069 format['ext'] = determine_ext(format['url'])
6070 if format.get('vcodec') == 'none':
6071 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6072 format['video_ext'] = 'none'
6073 else:
6074 format['video_ext'] = format['ext']
6075 format['audio_ext'] = 'none'
6076 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6077 # format['preference'] = -1000
6078
6079 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # carrying HEVC in FLV is out of spec per the original FLV specification
6081 # ref. https://trac.ffmpeg.org/ticket/6389
6082 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
6083 format['preference'] = -100
6084
6085 # Determine missing bitrates
6086 if format.get('tbr') is None:
6087 if format.get('vbr') is not None and format.get('abr') is not None:
6088 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6089 else:
            if format.get('vcodec') != 'none' and format.get('vbr') is None:
                # `abr` may exist with a value of None; coalesce to 0 to avoid a TypeError
                format['vbr'] = format['tbr'] - (format.get('abr') or 0)
            if format.get('acodec') != 'none' and format.get('abr') is None:
                format['abr'] = format['tbr'] - (format.get('vbr') or 0)
6094
6095 return tuple(self._calculate_field_preference(format, field) for field in self._order)