yt_dlp/utils/_utils.py

   1 import asyncio
   2 import atexit
   3 import base64
   4 import binascii
   5 import calendar
   6 import codecs
   7 import collections
   8 import collections.abc
   9 import contextlib
  10 import datetime
  11 import email.header
  12 import email.utils
  13 import errno
  14 import hashlib
  15 import hmac
  16 import html.entities
  17 import html.parser
  18 import inspect
  19 import io
  20 import itertools
  21 import json
  22 import locale
  23 import math
  24 import mimetypes
  25 import netrc
  26 import operator
  27 import os
  28 import platform
  29 import random
  30 import re
  31 import shlex
  32 import socket
  33 import ssl
  34 import struct
  35 import subprocess
  36 import sys
  37 import tempfile
  38 import time
  39 import traceback
  40 import types
  41 import unicodedata
  42 import urllib.error
  43 import urllib.parse
  44 import urllib.request
  45 import xml.etree.ElementTree
  46
  47 from . import traversal
  48
  49 from ..compat import functools  # isort: split
  50 from ..compat import (
  51     compat_etree_fromstring,
  52     compat_expanduser,
  53     compat_HTMLParseError,
  54     compat_os_name,
  55     compat_shlex_quote,
  56 )
  57 from ..dependencies import websockets, xattr
  58
# Pretend to be the parent module: drop the last dotted component of this
# module's own name
__name__ = __name__.rsplit('.', 1)[0]

# Type of the objects returned by re.compile()
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
  63
  64
class NO_DEFAULT:
    # Sentinel used as the default of `default=`/`is_id=` parameters in this
    # file. It is compared by identity (`is NO_DEFAULT`), which lets callers
    # pass None as a real value
    pass
  67
  68
def IDENTITY(x):
    """Return the argument unchanged"""
    return x
  71
  72
# Full English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names (January first) keyed by two-letter language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
  88
# Timezone abbreviation -> integer offset
# NOTE(review): the values look like whole-hour offsets from UTC - confirm against the users of this table
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
  98
# needed for sanitizing filenames in restricted mode
# Maps each accented character of the first string to its ASCII replacement;
# the list items of the chain are the replacements that are longer than one
# character ('AE', 'OE', 'TH', 'ss', ...)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 103
# Date/time format strings that do not depend on whether the day or the month
# is written first
# NOTE(review): these look like strptime() formats - confirm against the code that consumes them
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)
 148
# DATE_FORMATS plus the numeric formats in which the day precedes the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])
 160
# DATE_FORMATS plus the numeric formats in which the month precedes the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 169
# Captures four groups: a payload string, two integers and a '|'-separated
# word list that is followed by .split('|')
# NOTE(review): presumably targets "packed" JavaScript - confirm against the code that uses it
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element and captures its
# JSON object or array as the named group `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Unsigned decimal number with an optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
 174
 175
 176 @functools.cache
 177 def preferredencoding():
 178     """Get preferred encoding.
 179
 180     Returns the best encoding scheme for the system, based on
 181     locale.getpreferredencoding() and some further tweaks.
 182     """
 183     try:
 184         pref = locale.getpreferredencoding()
 185         'TEST'.encode(pref)
 186     except Exception:
 187         pref = 'UTF-8'
 188
 189     return pref
 190
 191
 192 def write_json_file(obj, fn):
 193     """ Encode obj as JSON and write it to fn, atomically if possible """
 194
 195     tf = tempfile.NamedTemporaryFile(
 196         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 197         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 198
 199     try:
 200         with tf:
 201             json.dump(obj, tf, ensure_ascii=False)
 202         if sys.platform == 'win32':
 203             # Need to remove existing file on Windows, else os.rename raises
 204             # WindowsError or FileExistsError.
 205             with contextlib.suppress(OSError):
 206                 os.unlink(fn)
 207         with contextlib.suppress(OSError):
 208             mask = os.umask(0)
 209             os.umask(mask)
 210             os.chmod(tf.name, 0o666 & ~mask)
 211         os.rename(tf.name, fn)
 212     except Exception:
 213         with contextlib.suppress(OSError):
 214             os.remove(tf.name)
 215         raise
 216
 217
 218 def find_xpath_attr(node, xpath, key, val=None):
 219     """ Find the xpath xpath[@key=val] """
 220     assert re.match(r'^[a-zA-Z_-]+$', key)
 221     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 222     return node.find(expr)
 223
# On Python 2.6, the xml.etree.ElementTree.Element methods do not support
# the namespaces parameter
 226
 227
 228 def xpath_with_ns(path, ns_map):
 229     components = [c.split(':') for c in path.split('/')]
 230     replaced = []
 231     for c in components:
 232         if len(c) == 1:
 233             replaced.append(c[0])
 234         else:
 235             ns, tag = c
 236             replaced.append('{%s}%s' % (ns_map[ns], tag))
 237     return '/'.join(replaced)
 238
 239
 240 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 241     def _find_xpath(xpath):
 242         return node.find(xpath)
 243
 244     if isinstance(xpath, str):
 245         n = _find_xpath(xpath)
 246     else:
 247         for xp in xpath:
 248             n = _find_xpath(xp)
 249             if n is not None:
 250                 break
 251
 252     if n is None:
 253         if default is not NO_DEFAULT:
 254             return default
 255         elif fatal:
 256             name = xpath if name is None else name
 257             raise ExtractorError('Could not find XML element %s' % name)
 258         else:
 259             return None
 260     return n
 261
 262
 263 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 264     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 265     if n is None or n == default:
 266         return n
 267     if n.text is None:
 268         if default is not NO_DEFAULT:
 269             return default
 270         elif fatal:
 271             name = xpath if name is None else name
 272             raise ExtractorError('Could not find XML element\'s text %s' % name)
 273         else:
 274             return None
 275     return n.text
 276
 277
 278 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 279     n = find_xpath_attr(node, xpath, key)
 280     if n is None:
 281         if default is not NO_DEFAULT:
 282             return default
 283         elif fatal:
 284             name = f'{xpath}[@{key}]' if name is None else name
 285             raise ExtractorError('Could not find XML attribute %s' % name)
 286         else:
 287             return None
 288     return n.attrib[key]
 289
 290
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document

    Shorthand for get_element_by_attribute('id', ...); additional keyword
    arguments are passed on to it
    """
    return get_element_by_attribute('id', id, html, **kwargs)
 294
 295
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document

    Shorthand for get_element_html_by_attribute('id', ...); additional keyword
    arguments are passed on to it
    """
    return get_element_html_by_attribute('id', id, html, **kwargs)
 299
 300
 301 def get_element_by_class(class_name, html):
 302     """Return the content of the first tag with the specified class in the passed HTML document"""
 303     retval = get_elements_by_class(class_name, html)
 304     return retval[0] if retval else None
 305
 306
 307 def get_element_html_by_class(class_name, html):
 308     """Return the html of the first tag with the specified class in the passed HTML document"""
 309     retval = get_elements_html_by_class(class_name, html)
 310     return retval[0] if retval else None
 311
 312
 313 def get_element_by_attribute(attribute, value, html, **kwargs):
 314     retval = get_elements_by_attribute(attribute, value, html, **kwargs)
 315     return retval[0] if retval else None
 316
 317
 318 def get_element_html_by_attribute(attribute, value, html, **kargs):
 319     retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
 320     return retval[0] if retval else None
 321
 322
 323 def get_elements_by_class(class_name, html, **kargs):
 324     """Return the content of all tags with the specified class in the passed HTML document as a list"""
 325     return get_elements_by_attribute(
 326         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 327         html, escape_value=False)
 328
 329
 330 def get_elements_html_by_class(class_name, html):
 331     """Return the html of all tags with the specified class in the passed HTML document as a list"""
 332     return get_elements_html_by_attribute(
 333         'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
 334         html, escape_value=False)
 335
 336
 337 def get_elements_by_attribute(*args, **kwargs):
 338     """Return the content of the tag with the specified attribute in the passed HTML document"""
 339     return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 340
 341
 342 def get_elements_html_by_attribute(*args, **kwargs):
 343     """Return the html of the tag with the specified attribute in the passed HTML document"""
 344     return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 345
 346
 347 def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
 348     """
 349     Return the text (content) and the html (whole) of the tag with the specified
 350     attribute in the passed HTML document
 351     """
 352     if not value:
 353         return
 354
 355     quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
 356
 357     value = re.escape(value) if escape_value else value
 358
 359     partial_element_re = rf'''(?x)
 360         <(?P<tag>{tag})
 361          (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
 362          \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
 363         '''
 364
 365     for m in re.finditer(partial_element_re, html):
 366         content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
 367
 368         yield (
 369             unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
 370             whole
 371         )
 372
 373
 374 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
 375     """
 376     HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
 377     closing tag for the first opening tag it has encountered, and can be used
 378     as a context manager
 379     """
 380
 381     class HTMLBreakOnClosingTagException(Exception):
 382         pass
 383
 384     def __init__(self):
 385         self.tagstack = collections.deque()
 386         html.parser.HTMLParser.__init__(self)
 387
 388     def __enter__(self):
 389         return self
 390
 391     def __exit__(self, *_):
 392         self.close()
 393
 394     def close(self):
 395         # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
 396         # so data remains buffered; we no longer have any interest in it, thus
 397         # override this method to discard it
 398         pass
 399
 400     def handle_starttag(self, tag, _):
 401         self.tagstack.append(tag)
 402
 403     def handle_endtag(self, tag):
 404         if not self.tagstack:
 405             raise compat_HTMLParseError('no tags in the stack')
 406         while self.tagstack:
 407             inner_tag = self.tagstack.pop()
 408             if inner_tag == tag:
 409                 break
 410         else:
 411             raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
 412         if not self.tagstack:
 413             raise self.HTMLBreakOnClosingTagException()
 414
 415
 416 # XXX: This should be far less strict
 417 def get_element_text_and_html_by_tag(tag, html):
 418     """
 419     For the first element with the specified tag in the passed HTML document
 420     return its' content (text) and the whole element (html)
 421     """
 422     def find_or_raise(haystack, needle, exc):
 423         try:
 424             return haystack.index(needle)
 425         except ValueError:
 426             raise exc
 427     closing_tag = f'</{tag}>'
 428     whole_start = find_or_raise(
 429         html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
 430     content_start = find_or_raise(
 431         html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
 432     content_start += whole_start + 1
 433     with HTMLBreakOnClosingTagParser() as parser:
 434         parser.feed(html[whole_start:content_start])
 435         if not parser.tagstack or parser.tagstack[0] != tag:
 436             raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
 437         offset = content_start
 438         while offset < len(html):
 439             next_closing_tag_start = find_or_raise(
 440                 html[offset:], closing_tag,
 441                 compat_HTMLParseError(f'closing {tag} tag not found'))
 442             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
 443             try:
 444                 parser.feed(html[offset:offset + next_closing_tag_end])
 445                 offset += next_closing_tag_end
 446             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
 447                 return html[content_start:offset + next_closing_tag_start], \
 448                     html[whole_start:offset + next_closing_tag_end]
 449         raise compat_HTMLParseError('unexpected end of html')
 450
 451
 452 class HTMLAttributeParser(html.parser.HTMLParser):
 453     """Trivial HTML parser to gather the attributes for a single element"""
 454
 455     def __init__(self):
 456         self.attrs = {}
 457         html.parser.HTMLParser.__init__(self)
 458
 459     def handle_starttag(self, tag, attrs):
 460         self.attrs = dict(attrs)
 461         raise compat_HTMLParseError('done')
 462
 463
 464 class HTMLListAttrsParser(html.parser.HTMLParser):
 465     """HTML parser to gather the attributes for the elements of a list"""
 466
 467     def __init__(self):
 468         html.parser.HTMLParser.__init__(self)
 469         self.items = []
 470         self._level = 0
 471
 472     def handle_starttag(self, tag, attrs):
 473         if tag == 'li' and self._level == 0:
 474             self.items.append(dict(attrs))
 475         self._level += 1
 476
 477     def handle_endtag(self, tag):
 478         self._level -= 1
 479
 480
 481 def extract_attributes(html_element):
 482     """Given a string for an HTML element such as
 483     <el
 484          a="foo" B="bar" c="&98;az" d=boz
 485          empty= noval entity="&amp;"
 486          sq='"' dq="'"
 487     >
 488     Decode and return a dictionary of attributes.
 489     {
 490         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 491         'empty': '', 'noval': None, 'entity': '&',
 492         'sq': '"', 'dq': '\''
 493     }.
 494     """
 495     parser = HTMLAttributeParser()
 496     with contextlib.suppress(compat_HTMLParseError):
 497         parser.feed(html_element)
 498         parser.close()
 499     return parser.attrs
 500
 501
 502 def parse_list(webpage):
 503     """Given a string for an series of HTML <li> elements,
 504     return a dictionary of their attributes"""
 505     parser = HTMLListAttrsParser()
 506     parser.feed(webpage)
 507     parser.close()
 508     return parser.items
 509
 510
 511 def clean_html(html):
 512     """Clean an HTML snippet into a readable string"""
 513
 514     if html is None:  # Convenience for sanitizing descriptions etc.
 515         return html
 516
 517     html = re.sub(r'\s+', ' ', html)
 518     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 519     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 520     # Strip html tags
 521     html = re.sub('<.*?>', '', html)
 522     # Replace html entities
 523     html = unescapeHTML(html)
 524     return html.strip()
 525
 526
 527 class LenientJSONDecoder(json.JSONDecoder):
 528     # TODO: Write tests
 529     def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
 530         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 531         self._close_attempts = 2 * close_objects
 532         super().__init__(*args, **kwargs)
 533
 534     @staticmethod
 535     def _close_object(err):
 536         doc = err.doc[:err.pos]
 537         # We need to add comma first to get the correct error message
 538         if err.msg.startswith('Expecting \',\''):
 539             return doc + ','
 540         elif not doc.endswith(','):
 541             return
 542
 543         if err.msg.startswith('Expecting property name'):
 544             return doc[:-1] + '}'
 545         elif err.msg.startswith('Expecting value'):
 546             return doc[:-1] + ']'
 547
 548     def decode(self, s):
 549         if self.transform_source:
 550             s = self.transform_source(s)
 551         for attempt in range(self._close_attempts + 1):
 552             try:
 553                 if self.ignore_extra:
 554                     return self.raw_decode(s.lstrip())[0]
 555                 return super().decode(s)
 556             except json.JSONDecodeError as e:
 557                 if e.pos is None:
 558                     raise
 559                 elif attempt < self._close_attempts:
 560                     s = self._close_object(e)
 561                     if s is not None:
 562                         continue
 563                 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
 564         assert False, 'Too many attempts to decode JSON'
 565
 566
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        # Prefer the underlying binary buffer when stdout has one
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # At most two attempts: the filename as given, then its sanitize_path()d form
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    # NOTE(review): presumably LockingUnsupportedError is an OSError subclass,
                    # so that the handler below falls back to open() - confirm
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed; open the file without a lock instead
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt and on permission errors
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Retrying is pointless if sanitizing did not change anything
            if old_filename == filename:
                raise
 604
 605
 606 def timeconvert(timestr):
 607     """Convert RFC 2822 defined time string into system timestamp"""
 608     timestamp = None
 609     timetuple = email.utils.parsedate_tz(timestr)
 610     if timetuple is not None:
 611         timestamp = email.utils.mktime_tz(timetuple)
 612     return timestamp
 613
 614
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; '' only for an empty input
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Replacement for a single character. Substituted characters are
        # prefixed with '\0' so that the regexes further down can tell them
        # apart from characters of the original string
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # '?' and control characters are dropped altogether
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the remaining markers; never return an empty name
    result = result.replace('\0', '') or '_'

    # NO_DEFAULT is a class and therefore truthy, so this cleanup only runs
    # when is_id was explicitly passed as a false value
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 668
 669
 670 def sanitize_path(s, force=False):
 671     """Sanitizes and normalizes path on Windows"""
 672     if sys.platform == 'win32':
 673         force = False
 674         drive_or_unc, _ = os.path.splitdrive(s)
 675     elif force:
 676         drive_or_unc = ''
 677     else:
 678         return s
 679
 680     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 681     if drive_or_unc:
 682         norm_path.pop(0)
 683     sanitized_path = [
 684         path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
 685         for path_part in norm_path]
 686     if drive_or_unc:
 687         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 688     elif force and s and s[0] == os.path.sep:
 689         sanitized_path.insert(0, os.path.sep)
 690     return os.path.join(*sanitized_path)
 691
 692
 693 def sanitize_url(url, *, scheme='http'):
 694     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 695     # the number of unwanted failures due to missing protocol
 696     if url is None:
 697         return
 698     elif url.startswith('//'):
 699         return f'{scheme}:{url}'
 700     # Fix some common typos seen so far
 701     COMMON_TYPOS = (
 702         # https://github.com/ytdl-org/youtube-dl/issues/15649
 703         (r'^httpss://', r'https://'),
 704         # https://bx1.be/lives/direct-tv/
 705         (r'^rmtp([es]?)://', r'rtmp\1://'),
 706     )
 707     for mistake, fixup in COMMON_TYPOS:
 708         if re.match(mistake, url):
 709             return re.sub(mistake, fixup, url)
 710     return url
 711
 712
 713 def extract_basic_auth(url):
 714     parts = urllib.parse.urlsplit(url)
 715     if parts.username is None:
 716         return url, None
 717     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 718         parts.hostname if parts.port is None
 719         else '%s:%d' % (parts.hostname, parts.port))))
 720     auth_payload = base64.b64encode(
 721         ('%s:%s' % (parts.username, parts.password or '')).encode())
 722     return url, f'Basic {auth_payload.decode()}'
 723
 724
 725 def expand_path(s):
 726     """Expand shell variables and ~"""
 727     return os.path.expandvars(compat_expanduser(s))
 728
 729
 730 def orderedSet(iterable, *, lazy=False):
 731     """Remove all duplicates from the input iterable"""
 732     def _iter():
 733         seen = []  # Do not use set since the items can be unhashable
 734         for x in iterable:
 735             if x not in seen:
 736                 seen.append(x)
 737                 yield x
 738
 739     return _iter() if lazy else list(_iter())
 740
 741
 742 def _htmlentity_transform(entity_with_semicolon):
 743     """Transforms an HTML entity to a character."""
 744     entity = entity_with_semicolon[:-1]
 745
 746     # Known non-numeric HTML entity
 747     if entity in html.entities.name2codepoint:
 748         return chr(html.entities.name2codepoint[entity])
 749
 750     # TODO: HTML5 allows entities without a semicolon.
 751     # E.g. '&Eacuteric' should be decoded as 'Éric'.
 752     if entity_with_semicolon in html.entities.html5:
 753         return html.entities.html5[entity_with_semicolon]
 754
 755     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 756     if mobj is not None:
 757         numstr = mobj.group(1)
 758         if numstr.startswith('x'):
 759             base = 16
 760             numstr = '0%s' % numstr
 761         else:
 762             base = 10
 763         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 764         with contextlib.suppress(ValueError):
 765             return chr(int(numstr, base))
 766
 767     # Unknown entity in name, return its literal representation
 768     return '&%s;' % entity
 769
 770
 771 def unescapeHTML(s):
 772     if s is None:
 773         return None
 774     assert isinstance(s, str)
 775
 776     return re.sub(
 777         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 778
 779
 780 def escapeHTML(text):
 781     return (
 782         text
 783         .replace('&', '&amp;')
 784         .replace('<', '&lt;')
 785         .replace('>', '&gt;')
 786         .replace('"', '&quot;')
 787         .replace("'", '&#39;')
 788     )
 789
 790
 791 class netrc_from_content(netrc.netrc):
 792     def __init__(self, content):
 793         self.hosts, self.macros = {}, {}
 794         with io.StringIO(content) as stream:
 795             self._parse('-', stream, False)
 796
 797
class Popen(subprocess.Popen):
    """subprocess.Popen with a fixed-up environment, a `text` shorthand and
    helpers that kill the process when communicating with it is interrupted"""

    # Startup information that is passed to every process on Windows
    # NOTE(review): presumably keeps a console window from popping up - confirm
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573

        env is modified in place; nothing is done unless sys._MEIPASS exists
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            # Reset the variable to the value saved as <key>_ORIG, or remove
            # it if no such value was saved
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        """Takes the arguments of subprocess.Popen

        @param env   Environment of the process; a copy of os.environ by default
        @param text  If True, run in text mode, with encoding and errors
                     defaulting to 'utf-8' and 'replace'
        """
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remembered so that run() knows whether to default to '' or b''
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process and wait for it if anything is raised"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process

        @param timeout  0 (the default) does not wait for the process to exit;
                        any other value, including None, is passed to wait()
        """
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run a process to completion and return (stdout, stderr, returncode)

        Output that was not captured is returned as an empty str or bytes,
        depending on whether the process runs in text mode
        """
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
 854
 855
 856 def encodeArgument(s):
 857     # Legacy code that uses byte strings
 858     # Uncomment the following line after fixing all post processors
 859     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 860     return s if isinstance(s, str) else s.decode('ascii')
 861
 862
 863 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 864
 865
 866 def timetuple_from_msec(msec):
 867     secs, msec = divmod(msec, 1000)
 868     mins, secs = divmod(secs, 60)
 869     hrs, mins = divmod(mins, 60)
 870     return _timetuple(hrs, mins, secs, msec)
 871
 872
 873 def formatSeconds(secs, delim=':', msec=False):
 874     time = timetuple_from_msec(secs * 1000)
 875     if time.hours:
 876         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 877     elif time.minutes:
 878         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 879     else:
 880         ret = '%d' % time.seconds
 881     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 882
 883
 884 def bug_reports_message(before=';'):
 885     from ..update import REPOSITORY
 886
 887     msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
 888            'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')
 889
 890     before = before.rstrip()
 891     if not before or before.endswith(('.', '!', '?')):
 892         msg = msg[0].title() + msg[1:]
 893
 894     return (before + ' ' if before else '') + msg
 895
 896
 897 class YoutubeDLError(Exception):
 898     """Base exception for YoutubeDL errors."""
 899     msg = None
 900
 901     def __init__(self, msg=None):
 902         if msg is not None:
 903             self.msg = msg
 904         elif self.msg is None:
 905             self.msg = type(self).__name__
 906         super().__init__(self.msg)
 907
 908
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        cause, video_id and ie, if given, are included in the final message.
        """
        from ..networking.exceptions import network_exceptions
        # When raised while a network exception is being handled, the error is always "expected"
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When wrapping another ExtractorError, reuse the exc_info it recorded
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: "[ie] video_id: orig_msg (caused by ...)", followed by
        # the bug report hint unless the error is expected
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted traceback of `traceback` and/or `cause`, or None if there is neither"""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once `msg` has been set, changing any other attribute rebuilds
        # the message so that `msg` and `args` stay in sync with it
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
 951
 952
class UnsupportedError(ExtractorError):
    """Expected ExtractorError with the message "Unsupported URL: <url>"; the URL is kept in `url`"""

    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
 958
 959
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match

    Adds no behavior of its own on top of ExtractorError.
    """
    pass
 963
 964
 965 class GeoRestrictedError(ExtractorError):
 966     """Geographic restriction Error exception.
 967
 968     This exception may be thrown when a video is not available from your
 969     geographic location due to geographic restrictions imposed by a website.
 970     """
 971
 972     def __init__(self, msg, countries=None, **kwargs):
 973         kwargs['expected'] = True
 974         super().__init__(msg, **kwargs)
 975         self.countries = countries
 976
 977
 978 class UserNotLive(ExtractorError):
 979     """Error when a channel/user is not live"""
 980
 981     def __init__(self, msg=None, **kwargs):
 982         kwargs['expected'] = True
 983         super().__init__(msg or 'The channel is not currently live', **kwargs)
 984
 985
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        # Stored as given; None when the caller did not supply it
        self.exc_info = exc_info
 998
 999
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Message used when none is passed to the constructor
    msg = 'Entry not found in info'
1007
1008
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """`filename`, if given, is appended to the default message"""
        message = self.msg if filename is None else f'{self.msg}: {filename}'
        super().__init__(message)
1021
1022
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.

    Adds no behavior of its own on top of YoutubeDLError.
    """
1029
1030
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Message used when none is passed to the constructor
    msg = 'The download was cancelled'
1034
1035
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Overrides the default message of DownloadCancelled
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1039
1040
class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    # Overrides the default message of DownloadCancelled
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1044
1045
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Overrides the default message of DownloadCancelled
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1049
1050
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """`expected` is stored on the instance as given"""
        super().__init__(msg)
        self.expected = expected
1057
1058
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always uses the fixed class-level message and is never marked as expected
        super().__init__(self.msg, expected=False)
1065
1066
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """`err`, if given, is appended to the default message"""
        message = self.msg if err is None else f'{self.msg}: {err}'
        super().__init__(message)
1079
1080
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """Build the message from both byte counts and keep them as attributes"""
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
1094
1095
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an error code, a message and a coarse `reason` derived from them

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify the failure using the errno and/or the message text
        out_of_space = (
            code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in msg or 'Disk quota exceeded' in msg)
        if out_of_space:
            self.reason = 'NO_SPACE'
            return
        too_long = code == errno.E2BIG or 'Argument list too long' in msg
        self.reason = 'VALUE_TOO_LONG' if too_long else 'NOT_SUPPORTED'
1110
1111
class XAttrUnavailableError(YoutubeDLError):
    # Adds no behavior of its own on top of YoutubeDLError
    pass
1114
1115
def is_path_like(f):
    """Whether `f` is a str, bytes or os.PathLike object"""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1118
1119
def extract_timezone(date_str):
    """Split a trailing timezone off a date string

    A trailing "Z" or numeric offset (+hh[:]mm / -hh[:]mm) is tried first,
    then a trailing upper-case name that is looked up in TIMEZONE_NAMES.

    @returns  (timezone, date_str): the UTC offset as a datetime.timedelta
              (zero if no timezone was recognized) and the date string with
              the recognized timezone removed
    """
    numeric = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if numeric:
        remainder = date_str[:-len(numeric.group('tz'))]
        if not numeric.group('sign'):
            # A bare "Z" means UTC
            return datetime.timedelta(), remainder
        direction = 1 if numeric.group('sign') == '+' else -1
        offset = datetime.timedelta(
            hours=direction * int(numeric.group('hours')),
            minutes=direction * int(numeric.group('minutes')))
        return offset, remainder

    named = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    offset_hours = TIMEZONE_NAMES.get(named and named.group('tz').strip())
    if offset_hours is not None:
        date_str = date_str[:-len(named.group('tz'))]
    return datetime.timedelta(hours=offset_hours or 0), date_str
1148
1149
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param delimiter  Separator between the date and the time part
    @param timezone   UTC offset as a datetime.timedelta; extracted from
                      date_str when omitted
    @returns          The timestamp, or None if date_str is None or does
                      not have the expected format
    """
    if date_str is None:
        return None

    # Fractional seconds are discarded
    cleaned = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    try:
        parsed = datetime.datetime.strptime(cleaned, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((parsed - timezone).timetuple())
    except ValueError:
        return None
1165
1166
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST if `day_first` is truthy, else DATE_FORMATS_MONTH_FIRST"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1169
1170
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    @param day_first  Selects which formats from date_formats() are tried
    @returns          The date as 'YYYYMMDD', or None if date_str is None
                      or could not be parsed
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    # Only the stripped string is needed; the timezone offset itself is discarded
    _, date_str = extract_timezone(date_str)

    # NB: The loop does not stop at the first match, so the last format that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # None of the formats matched; fall back to the email date parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)
1193
1194
def unified_timestamp(date_str, day_first=True):
    """Parse a free-form date/time string into a UNIX timestamp

    @param day_first  Selects which formats from date_formats() are tried
    @returns          The timestamp, or None if date_str is not a str or
                      could not be parsed
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas, pipes and the weekday names matched by the pattern, then collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # Any "PM" in the string shifts the parsed time by 12 hours
    # NOTE(review): this also applies to times like "12:30 PM" - confirm that this is intended
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # The first format that parses determines the result
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # None of the formats matched; fall back to the email date parser
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1226
1227
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL

    @param default_ext  Returned when url is None, contains no dot, or no
                        plausible extension is found
    """
    if url is None or '.' not in url:
        return default_ext
    # Only look at the part before the query string
    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1239
1240
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return replace_extension(filename, '<sub_lang>.<sub_format>', expected_real_ext)"""
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1243
1244
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        # Do not round anything until the unit of the offset (if any) is known
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # The part before the offset is resolved recursively
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # timedelta has no months/years, so these are added calendar-wise
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            # With auto precision, month and year offsets are rounded to the day
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    # No offset: date_str is a plain DATE in the given format
    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1285
1286
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1297
1298
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    # Work with a zero-based month so that divmod yields the year carry directly
    year_carry, month_index = divmod(dt.month + months - 1, 12)
    target_year = dt.year + year_carry
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(target_year, target_month, min(dt.day, last_day))
1306
1307
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    @param precision  microsecond|second|minute|hour|day;
                      'microsecond' returns dt unchanged
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    timestamp = calendar.timegm(dt.timetuple())
    step = unit_seconds[precision]
    # Round to the nearest multiple of `step`, with halves rounding up
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1324
1325
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that do not consist of exactly 8 digits are returned unchanged"""
    matched = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if matched is None:
        return date_str
    year, month, day = matched.groups()
    return f'{year}-{month}-{day}'
1334
1335
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        An omitted bound defaults to the earliest/latest representable date.
        Raises ValueError if start is after end
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start, strict=True)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __repr__(self):
        bounds = ', '.join(repr(bound.isoformat()) for bound in (self.start, self.end))
        return f'{__name__}.{type(self).__name__}({bounds})'

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return (self.start, self.end) == (other.start, other.end)
1369
1370
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime and platform

    It contains the Python version, implementation, machine, architecture,
    platform string, OpenSSL version and, if available, the libc version.
    The result is cached
    """
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    libc_info = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_info = platform.libc_ver()
    libc_suffix = format_field(join_nonempty(*libc_info, delim=' '), None, ', %s')

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        libc_suffix,
    )
1389
1390
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1398
1399
def write_string(s, out=None, encoding=None):
    """Write the string `s` to `out` and flush it

    @param out       Stream to write to; sys.stderr when omitted.
                     Nothing is written if the resulting stream is falsy
    @param encoding  Encoding used when bytes have to be written; defaults to
                     the stream's own encoding (when writing to its `buffer`)
                     or the preferred encoding. Unencodable characters are dropped
    """
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    # `mode` is not guaranteed to be a str: it may be missing, `None` or
    # e.g. an int, in which case the stream is not treated as binary
    mode = getattr(out, 'mode', None)
    if isinstance(mode, str) and 'b' in mode:
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Write encoded bytes to the underlying binary buffer
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1419
1420
# TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report a deprecation

    In the CLI, each distinct message is reported only once, through `printer`
    if given or else via write_string. Otherwise, a DeprecationWarning is issued.

    @param printer     Callable used to emit the message in the CLI
    @param stacklevel  Extra stack levels to skip for the DeprecationWarning
    @param kwargs      Passed on to `printer`/write_string
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return None

    already_reported = deprecation_warning._cache
    if msg in already_reported:
        return None
    already_reported.add(msg)

    text = f'{msg}{bug_reports_message()}'
    if printer:
        return printer(text, **kwargs)
    return write_string(f'ERROR: {text}\n', **kwargs)


# Messages that were already reported in the CLI
deprecation_warning._cache = set()
1437
1438
def bytes_to_intlist(bs):
    """Convert a bytes-like object (or a string of characters) to a list of ints"""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Items are characters rather than ints
        return list(map(ord, bs))
    return list(bs)  # Python 3
1446
1447
def intlist_to_bytes(xs):
    """Pack a sequence of ints (each 0-255) into a bytes object"""
    if xs:
        return struct.pack(f'{len(xs)}B', *xs)
    return b''
1452
1453
class LockingUnsupportedError(OSError):
    """OSError with a fixed message saying that file locking is not supported"""
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1459
1460
# Cross-platform file locking
# Defines `_lock_file(f, exclusive, block)` and `_unlock_file(f)` using
# LockFileEx/UnlockFileEx on Windows and fcntl elsewhere. When fcntl cannot be
# imported, both functions raise LockingUnsupportedError
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high parts of the number of bytes to lock, i.e. "the whole file"
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object since _unlock_file needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # The file is locked by someone else; do not retry with lockf
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            # Each attempt returns on success; a failure falls through to the next one
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
1547
1548
class locked_file:
    """File wrapper that holds a lock on the file while used as a context manager

    The file is opened in the constructor, locked on entering (shared for 'r'
    modes, exclusive otherwise) and unlocked and closed on exiting.
    Unknown attributes and iteration are delegated to the underlying file object
    """
    # Whether the lock is currently held
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """
        @param mode      One of r, rb, a, ab, w, wb; anything else raises NotImplementedError
        @param block     Whether acquiring the lock may block
        @param encoding  Passed to os.fdopen
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        """Acquire the lock (closing the file if that fails); 'w' modes truncate the file afterwards"""
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # The file was opened without truncation, so it is emptied only now that the lock is held
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; `locked` is reset even if unlocking fails"""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        # The file is closed even if unlocking raises
        try:
            self.unlock()
        finally:
            self.f.close()

    # Allow explicit use without a `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Only called for attributes not found normally; these are looked up on the file object
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1612
1613
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None (cached)"""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1618
1619
def shell_quote(args):
    """Quote each argument with compat_shlex_quote and join them with spaces

    bytes arguments are first decoded using the filesystem encoding
    """
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
1629
1630
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    `data` is JSON-encoded into the "__youtubedl_smuggle" field of the URL
    fragment. If `url` already carries smuggled data, both are merged and the
    values already present in the URL take precedence. `data` is not modified
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a new dict instead of updating the caller's `data` in place
    merged = {**data, **idata}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
1639
1640
def unsmuggle_url(smug_url, default=None):
    """Split a URL into the plain URL and the data smuggled in its fragment

    @returns  (url, data); (smug_url, default) if nothing was smuggled
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1648
1649
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    @param fmt     %-format receiving the scaled number and the suffix
    @param factor  Base of the suffixes; with 1024 the binary forms (Ki, Mi, ...) are used
    @returns       The formatted string, or None if num is negative or
                   float_or_none() returns None for it
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Limit the exponent to the available suffixes. Numbers below 1/factor have a
        # negative logarithm, which must not be used to index the suffixes from the end
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
1662
1663
def format_bytes(bytes):
    """Format a byte count with two decimals and a 1024-based suffix (e.g. KiB); 'N/A' if it cannot be formatted"""
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
1666
1667
def lookup_unit_table(unit_table, s, strict=False):
    """Parse "<number><unit>" and return the number multiplied by the unit's value

    @param unit_table  Mapping of unit name to multiplier
    @param strict      Require the whole string to match and do not accept
                       "," as the decimal separator
    @returns           The rounded product, or None if `s` does not match
    """
    if strict:
        num_re, matcher = NUMBER_RE, re.fullmatch
    else:
        num_re, matcher = NUMBER_RE.replace(R'\.', '[,.]'), re.match
    units_re = '|'.join(map(re.escape, unit_table))
    found = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if found is None:
        return None

    value = float(found.group('num').replace(',', '.'))
    return round(value * unit_table[found.group('unit')])
1679
1680
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    # '' -> 1, K -> 1024, M -> 1024**2, ...
    multipliers = {}
    for power, prefix in enumerate(('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')):
        multipliers[prefix] = 1024**power
    return lookup_unit_table(multipliers, s.upper(), strict=True)
1686
1687
def parse_filesize(s):
    """Parse a human-readable file size into a number of bytes

    Returns None if s is None or if lookup_unit_table finds no known unit in it.
    """
    if s is None:
        return None

    # (symbol, decimal name, binary name) for each successive power
    _PREFIXES = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1, 'bytes': 1}
    for power, (symbol, decimal_name, binary_name) in enumerate(_PREFIXES, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        # NB: The insertion order decides the order of the alternatives
        # in the regex that lookup_unit_table builds from the keys
        _UNIT_TABLE.update({
            f'{symbol}iB': binary,
            f'{symbol}B': decimal,
            f'{symbol.lower()}B': binary,
            f'{symbol}b': decimal,
            f'{symbol.lower()}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(_UNIT_TABLE, s)
1757
1758
def parse_count(s):
    """Parse a count, optionally with a k/m/kk/b multiplier suffix, into an integer

    Returns None if s is None or no number can be found.
    """
    if s is None:
        return None

    # Drop a leading run of non-digits that is followed by a whitespace
    text = re.sub(r'^[^\d]+\s', '', s).strip()

    # Nothing but digits and separators
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, text)
    if scaled is not None:
        return scaled

    # Fall back to a leading number followed by whitespace or end of string
    leading_number = re.match(r'([\d,.]+)(?:$|\s)', text)
    return str_to_int(leading_number.group(1)) if leading_number else None
1786
1787
def parse_resolution(s, *, lenient=False):
    """Extract resolution information from a string

    Recognizes, in order: "<width>x<height>" (also with X, × or , as separator),
    "<height>p"/"<height>i", and "4k"/"8k".

    @param s        string to search, or None
    @param lenient  if True, the "<width>x<height>" form may be directly
                    adjacent to other letters/digits
    @returns        dict with 'width' and/or 'height'; empty if nothing matched
    """
    if s is None:
        return {}

    dimensions_re = (
        r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
        else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    dimensions = re.search(dimensions_re, s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    k_label = re.search(r'\b([48])[kK]\b', s)
    if k_label:
        # 4k => 2160, 8k => 4320
        return {'height': int(k_label.group(1)) * 540}

    return {}
1811
1812
def parse_bitrate(s):
    """Return the number preceding "kbps" in s as an int, or None if there is none or s is not a str"""
    if isinstance(s, str):
        found = re.search(r'\b(\d+)\s*kbps', s)
        if found is not None:
            return int(found.group(1))
    return None
1819
1820
def month_by_name(name, lang='en'):
    """ Return the number of a month by its (locale-independent) name in the
        given language. English names are used for languages not in MONTH_NAMES """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    return names.index(name) + 1 if name in names else None
1830
1831
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    # The abbreviation is the first three letters of the English name
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev not in abbreviations:
        return None
    return abbreviations.index(abbrev) + 1
1840
1841
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity or a character reference by '&amp;'"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1848
1849
def setproctitle(title):
    """Set the process name through libc's prctl

    This is best-effort: nothing happens when ctypes, libc.so.6
    or prctl is not available.

    @param title  new name (str)
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initialize the buffer from the bytes itself so that it is made one byte
    # larger than the title and is therefore always NUL-terminated.
    # A buffer of exactly len(title) bytes has no room for the terminator
    buf = ctypes.create_string_buffer(title.encode())
    try:
        # PR_SET_NAME = 15      Ref: /usr/include/linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1875
1876
def remove_start(s, start):
    """Return s without the prefix `start`. None and strings without the prefix are returned as-is"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1879
1880
def remove_end(s, end):
    """Return s without the suffix `end`. None and strings without the suffix are returned as-is"""
    if s is None or not s.endswith(end):
        return s
    # NB: s[:-0] is the empty string, so an empty suffix must not be sliced off
    return s[:-len(end)] if end else s
1883
1884
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s, if any"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1892
1893
def get_domain(url):
    """
    Return the network location of the URL without a leading "www.",
    or None if there is nothing left.

    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1900
1901
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and trailing slashes"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1905
1906
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any query or fragment

    Raises AttributeError if the URL does not match.
    """
    mobj = re.match(r'https?://[^?#]+/', url)
    return mobj.group()
1909
1910
def urljoin(base, path):
    """Join path onto base, tolerating bytes and invalid input

    @param base  http(s) or protocol-relative base URL (str or bytes)
    @param path  URL or path to resolve (str or bytes)
    @returns     path itself if it already has a "scheme://" or starts with "//",
                 the joined URL if base is usable, otherwise None
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not (isinstance(path, str) and path):
        return None
    # Nothing to join if the path is already a full or protocol-relative URL
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode()
    if isinstance(base, str) and re.match(r'^(?:https?:)?//', base):
        return urllib.parse.urljoin(base, path)
    return None
1924
1925
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int scaled by invscale/scale (floor division), or return default on failure

    @param get_attr  if given and v is not None, the attribute of this name
                     is converted instead of v itself
    """
    value = v
    if v is not None and get_attr:
        value = getattr(v, get_attr, None)
    try:
        result = int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
    return result
1933
1934
def str_or_none(v, default=None):
    """Return str(v), or default if v is None"""
    if v is None:
        return default
    return str(v)
1937
1938
def str_to_int(int_str):
    """ A more relaxed version of int_or_none
    that ignores ',', '.' and '+' in strings. Ints are returned unchanged
    and any other type gives None """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
1946
1947
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float scaled by invscale/scale, or return default if v is None or not convertible"""
    if v is None:
        return default
    try:
        scaled = float(v) * invscale / scale
    except (ValueError, TypeError):
        scaled = default
    return scaled
1955
1956
def bool_or_none(v, default=None):
    """Return v if it is a bool, otherwise default"""
    if isinstance(v, bool):
        return v
    return default
1959
1960
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace if it is a str, otherwise default"""
    if not isinstance(v, str):
        return default
    return v.strip()
1963
1964
def url_or_none(url):
    """Return the stripped URL if it is protocol-relative or uses a known scheme
    (http(s), rtmp/rtsp variants, mms, ftp(s)); otherwise None"""
    if not (url and isinstance(url, str)):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
1970
1971
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a timestamp with strftime, or return default on failure

    @param timestamp    unix timestamp (int/float) or a "YYYYMMDD" str
    @param date_format  strftime format; "%s" is replaced by the unix timestamp
    @param default      value returned for other types and unparsable input
    """
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            epoch = datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
            moment = epoch + datetime.timedelta(seconds=timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            moment = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            return default

        unix_seconds = int(moment.timestamp())
        # Support %s on windows
        expanded_format = re.sub(r'(?<!%)(%%)*%s', rf'\g<1>{unix_seconds}', date_format)
        return moment.strftime(expanded_format)
    except (ValueError, TypeError, AttributeError):
        return default
1989
1990
def parse_duration(s):
    """Parse a duration string into a number of seconds (float)

    Three notations are tried in order:
      1. Clock-like: "[[[DD:]HH:]MM:]SS[.ms]"
      2. Units: "1d 2h 3m 4.5s", "3 hours 11 mins", ISO 8601-like "PT1H2M3S", ...
         (years, months and weeks are accepted but not counted)
      3. A single, possibly fractional, amount of hours or minutes ("2.5 hours")

    Returns None if s is not a str, is blank, or matches none of these.
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    # (days, hours, mins, secs, ms) as matched strings or None
    fields = None

    clock = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if clock:
        fields = clock.group('days', 'hours', 'mins', 'secs', 'ms')

    if fields is None:
        units = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if units:
            # The only capturing groups are days, hours, mins, secs, ms - in this order
            fields = units.groups()

    if fields is None:
        single = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
        if single:
            fields = (None, *single.groups(), None, None)

    if fields is None:
        return None

    days, hours, mins, secs, ms = fields
    if ms:
        # The clock notation also allows ':' before the fraction
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2045
2046
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the file's real extension

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the full filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{stem}.{ext}{real_ext}'
2053
2054
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's real extension by ext

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the full filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    keep_full_name = expected_real_ext and real_ext[1:] != expected_real_ext
    return f'{filename if keep_full_name else stem}.{ext}'
2060
2061
def check_executable(exe, args=[]):
    """ Checks if the given binary can be run, and returns its name (False otherwise).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        # The binary could not be launched
        return False
    else:
        return exe
2070
2071
def _get_exe_version_output(exe, args):
    """Run exe with args and return its combined stdout and stderr

    Returns None if it exits with a non-zero code and False if it cannot be run.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, ret = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if ret else stdout
2084
2085
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version from an executable's output

    @param output        output of the executable (str)
    @param version_re    regex whose first group is the version;
                         by default matches "version <something>"
    @param unrecognized  value returned when the regex does not match
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2095
2096
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    unrecognized can be a single value or a pair: its first item is returned
    when no version can be found in the output and its last item when the
    executable exits with an error """
    fallbacks = variadic(unrecognized)
    assert len(fallbacks) in (1, 2)

    output = _get_exe_version_output(exe, args)
    if output is None:
        return fallbacks[-1]
    if not output:
        # False (could not be run) or empty output; passed through unchanged
        return output
    return detect_exe_version(output, version_re, fallbacks[0])
2107
2108
def frange(start=0, stop=None, step=1):
    """Float range

    Like range(), but the arguments may be floats and stop may be infinite.
    Yields nothing when step is 0.
    """
    if stop is None:
        start, stop = 0, start
    if not step:
        return

    current = start
    if step > 0:
        while current < stop:
            yield current
            current += step
    else:
        while current > stop:
            yield current
            current += step
2117
2118
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the iterable only when they are needed and are
    kept in an internal cache so that they can be accessed again.
    With reverse=True, iteration and indexing see the items in reverse order."""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache allows sharing the already evaluated items with another
        # LazyList (see __reversed__ and __copy__)
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # First the items that were already evaluated, then the remaining ones,
        # which are added to the cache as they are produced
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        """Evaluate the entire iterable and return the cache (always in forward order)"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        # Unlike _exhaust, this honours the direction and returns a new list
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # ~x == -x - 1, i.e. the same position counted from the other end
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                # Translate the slice to the equivalent one on the forward cache
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            # step = 0 so that neither of the open-ended checks below applies
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of items missing from the cache to reach the highest index needed
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Look up a single item instead of using len(), which would evaluate
        # everything. For a reversed list, [-1] maps to index 0 of the cache
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # NB: This evaluates the entire iterable
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # The new object shares the cache with this one and wraps the same iterable
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        # The new object shares the cache with this one and wraps the same iterable
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2206
2207
class PagedList:
    """Base class for lists whose items are fetched one page at a time

    pagefunc(pagenum) must return an iterable with the items of that page.
    Subclasses implement _getslice(start, end).
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the items of the given page as a list, using the cache if possible"""
        cached = self._cache.get(pagenum)
        if cached is not None:
            results = cached
        elif pagenum > self._pagecount:
            # Past the last page; no need to call pagefunc
            results = []
        else:
            results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        """Return the items from index start up to (excluding) end as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not (isinstance(idx, int) and idx >= 0):
            raise TypeError('indices must be non-negative integers')
        found = self.getslice(idx, idx + 1)
        if found:
            return found[0]
        raise self.IndexError()
2246
2247
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # Yield the items from index `start` up to (excluding) `end`,
        # fetching one page at a time beginning with the page containing `start`
        for pagenum in itertools.count(start // self._pagesize):
            # This page covers the item indices firstid <= i < nextfirstid
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset within this page where the requested range starts
            # (non-zero only for the page containing `start`)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset within this page where the requested range ends,
            # or None if it extends beyond this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Treat the previous page as the last one from now on
                # (getpage returns [] for pages beyond _pagecount)
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2287
2288
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, use_cache=True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        pagesize = self._pagesize
        first_page = start // pagesize
        if end is None:
            # Open-ended: go through all the pages
            last_page, remaining = self._pagecount, None
        else:
            last_page, remaining = min(self._pagecount, end // pagesize + 1), end - start
        # Items to drop from the beginning of the first page
        to_skip = start - first_page * pagesize

        for pagenum in range(first_page, last_page):
            page = self.getpage(pagenum)
            if to_skip:
                page, to_skip = page[to_skip:], None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested range
                    yield from page[:remaining]
                    return
                remaining -= len(page)
            yield from page
2313
2314
class PlaylistEntries:
    """Access to the entries of a playlist's info dict by playlist index

    Indexing uses 1-based playlist indices and gives an iterator of
    (playlist_index, entry) tuples. See __getitem__
    """
    MissingEntry = object()  # Placeholder for positions that have no entry
    is_exhausted = False  # Whether the end of the entries is known to have been reached

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # `entries` holds only the requested items. Put each of them at its
            # 1-based playlist index and leave the other positions as MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            # Any other iterable is wrapped so that it can be indexed lazily
            self._entries = LazyList(entries)

    # A single item specification: "START", or a range "[START]:[END][:STEP]"
    # ("-" is also accepted in place of the first ":"). END may be "inf"/"infinite"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Parse a comma-separated list of item specifications

        Yields an int for each single index and a slice for each range.
        Raises ValueError for an empty or invalid segment, or a zero step.
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # NB: The end is parsed with float_or_none, unlike start and step
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (playlist_index, entry) for the items selected by the
        playlist_items/playliststart/playlistend params of ydl"""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    # Stop yielding any further items
                    return

    def get_full_count(self):
        """Return the total number of entries if it is known, otherwise None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one item per page, the number of pages is the number of items
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Function that returns the entry at a 0-based index and raises
        # self.IndexError when the index is beyond the end of the entries
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # The lookup is wrapped with ydl's _handle_extraction_exceptions
                    # since indexing a lazy/paged list can trigger extraction
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Yield (playlist_index, entry) for an int or a slice of 1-based
        playlist indices. Unlike for lists, the stop of a slice is inclusive
        and an int yields at most a single tuple instead of returning the entry"""
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            # Negative values count from the end, which requires the length
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move the stop one position further in the direction of the step
        # so that the stop itself is included
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forward, there is nothing more after the end.
                # Going backward, lower indices may still exist
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # NB: This goes through all the entries
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2449
2450
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape sequence in s by the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2457
2458
def lowercase_escape(s):
    r"""Replace every \uXXXX escape sequence in s by the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2465
2466
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # '%' is kept as-is so that existing percent-escapes are not escaped again
    return urllib.parse.quote(s, safe=b"%/;:@&=+$,!~*'()?#[]")
2470
2471
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parsed = urllib.parse.urlparse(url)
    escaped_components = {
        # The host name is IDNA-encoded; the other components are percent-escaped
        'netloc': parsed.netloc.encode('idna').decode('ascii'),
        'path': escape_rfc3986(parsed.path),
        'params': escape_rfc3986(parsed.params),
        'query': escape_rfc3986(parsed.query),
        'fragment': escape_rfc3986(parsed.fragment),
    }
    return parsed._replace(**escaped_components).geturl()
2482
2483
def parse_qs(url, **kwargs):
    """Parse the query string of the URL. kwargs are passed on to urllib.parse.parse_qs"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2486
2487
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file

    Blank lines and lines starting with '#', ';' or ']' are skipped,
    and a trailing comment (whitespace followed by '#') is removed.

    @param batch_fd  iterable of lines (str or bytes) with a close() method.
                     It is closed before returning
    @returns         list of URLs
    """
    BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: Passing maxsplit as positional argument is deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2505
2506
def urlencode_postdata(*args, **kargs):
    """Like urllib.parse.urlencode, but returns ASCII-encoded bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2509
2510
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  update query
       @returns             str
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            # Nothing to change
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        current_query = urllib.parse.parse_qs(url.query)
        # doseq=True since the values of the parsed query are lists
        kwargs['query'] = urllib.parse.urlencode({**current_query, **query_update}, doseq=True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2529
2530
def update_url_query(url, query):
    """Update the URL's query with the given items. Shorthand for update_url(url, query_update=query)"""
    updated = update_url(url, query_update=query)
    return updated
2533
2534
def _multipart_encode_impl(data, boundary):
    """Encode data as multipart/form-data using the given boundary

    @param data      dict whose keys and values are str or bytes
    @param boundary  boundary (ASCII str)
    @returns         (body bytes, Content-Type header value)
    @raises          ValueError if the boundary occurs in any of the fields
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    fields = data.items()
    delimiter = boundary.encode('ascii')

    chunks = []
    for name, value in fields:
        if isinstance(name, str):
            name = name.encode()
        if isinstance(value, str):
            value = value.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        part = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if delimiter in part:
            raise ValueError('Boundary overlaps with data')
        chunks += (b'--' + delimiter + b'\r\n', part)

    # Closing delimiter
    chunks.append(b'--' + delimiter + b'--\r\n')

    return b''.join(chunks), content_type
2555
2556
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple (encoded body as bytes, Content-Type header value).

    Raises ValueError if a caller-supplied boundary occurs inside the data,
    and UnicodeError if the boundary or a field cannot be encoded.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-chosen boundary gets exactly one attempt; any error propagates
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except UnicodeError:
            # UnicodeEncodeError is a subclass of ValueError. It signals data that
            # can never be encoded, so a new boundary would not help and
            # retrying would loop forever
            raise
        except ValueError:
            # The random boundary collided with the data; draw another one
            continue
2585
2586
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Tell whether `x` should be treated as a collection of items

    @param allowed_types  type(s) that `x` must be an instance of
    @param blocked_types  type(s) that disqualify `x` even if it is allowed;
                          defaults to (str, bytes, collections.abc.Mapping)
    @returns              bool
    """
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
2591
2592
def variadic(x, allowed_types=NO_DEFAULT):
    """Return `x` itself if it is iterable-like, otherwise the 1-tuple (x, )

    @param allowed_types  tuple or type; forwarded to is_iterable_like as the
                          types that must NOT be treated as iterables.
                          Passing any other kind of value is deprecated; it is
                          converted with tuple() after a deprecation warning
    """
    if isinstance(allowed_types, (tuple, type)):
        scalar_types = allowed_types
    else:
        deprecation_warning('allowed_types should be a tuple or a type')
        scalar_types = tuple(allowed_types)

    if is_iterable_like(x, blocked_types=scalar_types):
        return x
    return (x, )
2598
2599
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Return the result of the first of `funcs` that succeeds

    Each function is called as f(*args, **kwargs). A call that raises one of
    the lookup/conversion errors listed below, or whose result is not an
    instance of `expected_type` (when given), is skipped.

    @returns  the first acceptable result, or None if there is none
    """
    ignored_errors = (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError)
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except ignored_errors:
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
2609
2610
def try_get(src, getter, expected_type=None):
    """Apply getter(s) to `src` via try_call and return the first usable result

    @param getter  a single callable or several of them (normalised with variadic)
    """
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
2613
2614
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of `dct` for which cndn(key, value) is truthy

    By default, items whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
2617
2618
def merge_dicts(*dicts):
    """Merge dicts, giving precedence to the earlier ones

    A key keeps the first non-None value seen for it. The only exception is
    an empty string, which is overwritten by a later str value.
    """
    merged = {}
    for source in dicts:
        for key, value in source.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
2627
2628
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as str, decoding it with `encoding` if it is not a str already

    Note that the default for `encoding` is computed once, when the function
    is defined.
    """
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
2631
2632
# US rating label -> age limit, as returned by parse_age_limit
# (which upper-cases its input before the lookup)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# TV rating label -> age limit. parse_age_limit builds its regex from the part
# of each key after the 'TV-' prefix, so every key must keep that prefix
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2650
2651
def parse_age_limit(s):
    """Convert an age rating into a numeric age limit

    @param s  an int (accepted if within 0..21), or a str that is either
              a 1-2 digit number with optional trailing '+', a key of
              US_RATINGS, or a TV_PARENTAL_GUIDELINES label
              (case-insensitive; 'TV-', 'TV_' or bare 'TV' prefix)
    @returns  the age limit as int, or None if `s` is not understood
    """
    # bool is a subclass of int, so an exact type check is needed to reject True/False
    if type(s) is int:  # noqa: E721
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    tv_suffixes = '|'.join(label[3:] for label in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
2668
2669
def strip_jsonp(code):
    """Unwrap a JSONP response, returning just the callback's argument text

    Handles an optional `window.` prefix, an optional `name && name` guard,
    an optional trailing `;` and trailing `//` comments.
    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_call_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_call_re, r'\g<callback_data>', code)
2678
2679
def js_to_json(code, vars={}, *, strict=False):
    """Convert JavaScript source for a value (object/array literal etc.) into JSON text

    @param code    the JavaScript source text
    @param vars    dict of var, val pairs to substitute; identifiers found in it
                   are replaced by their value
    @param strict  if True, skip the heuristic rewrites of constructor/function
                   calls and raise ValueError on identifiers that cannot be
                   resolved, instead of turning them into JSON strings
    @returns       the converted text (it is not validated as JSON here)
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # A quoted string in any of the three quote styles, honouring backslash escapes
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    # A /* block */ comment, or a // line comment including its newline
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Whitespace with at most one comment in between
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) for integer literals that JSON cannot express:
    # hexadecimal and leading-zero octal, optionally used as an object key
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # group(1) is a bare double quote inside the string,
        # group(2) is the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Passthrough escapes keep (or gain) their backslash, "\x" is widened to
        # "\u00", an escaped newline is dropped and any other escaped
        # character loses its backslash
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Convert the expression inside ${...} of a template string; a result
        # that is a JSON string is decoded so that its text can be spliced in
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Replacement callback for every token found by the final regex below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, negation operators and trailing commas are removed
            return ''

        if v[0] in STRING_QUOTES:
            # Re-quote with double quotes; template strings get their ${...} expanded first
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Object keys must be strings in JSON
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            # In non-strict mode, a value that is not valid JSON is emitted as
            # a JSON string. In strict mode the value is inserted verbatim
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            # Unknown identifier (or numeric key): quote it
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # `new Map([[k, v], ...])` -> JSON object built from the converted list of pairs
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # Heuristic rewrites:
        # `new Date("...")` -> the quoted argument
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        # any other `new X(...)` -> the whole expression as a JSON string
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        # `parseInt(<non-digits>123<non-digits>)` -> 123
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        # `(function(...){...})("arg")` -> "arg"
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    # Tokens handed to fix_kv, in order of the alternatives: strings, comments,
    # trailing commas, `void 0`/identifiers, hex/octal integers (optionally as
    # keys), decimal integers used as keys, and runs of `!`
    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2758
2759
def qualities(quality_ids):
    """ Build a function that ranks a quality id by its position in quality_ids

    The returned function gives the index of the id within quality_ids,
    or -1 for an id that is not listed """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
2768
2769
# NOTE(review): by its name, the stages at which a postprocessor may be run;
# confirm against the consumers of this tuple
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output templates, keyed by output type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output types besides 'default'.
# NOTE(review): the non-None values look like the filename suffix used for
# that type; confirm against callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# Template for a verbose regex matching one such format specifier.
# {0} stands for the pattern of the mapping key and {1} for the pattern of
# the conversion type (presumably filled in with str.format; confirm at the call sites).
# An even number of preceding '%' is captured as `prefix`, so that escaped
# percent signs are not mistaken for the start of a specifier
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# Conversion type characters of the %-format syntax described above
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2809
2810
def limit_length(s, length):
    """ Add ellipses to overly long strings

    @param s       the string to shorten, or None
    @param length  maximum length of the result
    @returns       None if s is None; s itself if it fits within `length`;
                   otherwise s shortened so that it ends in '...' and is
                   exactly `length` long. If `length` leaves no room for the
                   ellipsis, s is plainly cut to `length` characters
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # `length - len(ELLIPSES)` would be negative here; as a slice end it
        # counts from the right, which made the result longer than `length`
        return s[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
2819
2820
def version_tuple(v):
    """Split a version string on '.' and '-' and return its parts as a tuple of ints

    Raises ValueError if a part is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2823
2824
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether `version` is older than `limit`

    @param assume_new  what to assume when `version` is empty or either
                       version cannot be parsed: if True (the default) the
                       version counts as new, i.e. False is returned
    """
    unknown_result = not assume_new
    if not version:
        return unknown_result
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return unknown_result
    return current < minimum
2832
2833
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported at call time rather than at module level
    from ..update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
2840
2841
def args_to_str(args):
    """Get a short string representation for a subprocess command

    Every argument is shell-quoted and the results are joined with spaces.
    """
    quoted_args = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
2845
2846
def error_to_str(err):
    """Format an exception as '<ExceptionClassName>: <message>'"""
    class_name = type(err).__name__
    return f'{class_name}: {err}'
2849
2850
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type to a file extension

    @param mt       MIME type, optionally followed by ';'-separated parameters
    @param default  value returned when the type is not a str or is not in
                    the table. If omitted, None is returned for a non-str and
                    the subtype (with '+' replaced by '.') for an unknown type
    @returns        the extension without leading dot
    """
    if not isinstance(mt, str):
        return None if default is NO_DEFAULT else default

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop any parameters (e.g. "; charset=...") and normalise the case
    full_type = mt.split(';', 1)[0].strip().lower()
    subtype = full_type.rsplit('/', 1)[-1]
    structured_suffix = subtype.rsplit('+')[-1]

    # Candidate keys: the full type, the subtype, then the part after the last '+'.
    # NOTE(review): presumably traverse_obj returns the first of these that resolves; confirm
    known_ext = traversal.traverse_obj(MAP, full_type, subtype, structured_suffix)
    if known_ext:
        return known_ext
    if default is NO_DEFAULT:
        return subtype.replace('+', '.')
    return default
2939
2940
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension, file name or URL

    @param ext_or_url  a value without any '.' is treated as a bare extension
    @returns           the MIME type guessed by mimetypes.guess_type (may be
                       None), or None for an empty input
    """
    if not ext_or_url:
        return None
    target = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    guessed_type, _encoding = mimetypes.guess_type(target)
    return guessed_type
2947
2948
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video/audio/subtitle codecs

    @returns  {} for an empty input. Otherwise, if any codec is recognised:
              'vcodec' and 'acodec' ('none' when absent), 'dynamic_range'
              ('DV', 'HDR10' or None) and, if present, 'scodec'.
              If nothing is recognised but exactly two codecs were given,
              they are taken to be vcodec and acodec, in that order
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    stripped = (codec.strip() for codec in codecs_str.strip().strip(',').split(','))
    split_codecs = [codec for codec in stripped if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros in front of a digit are dropped before matching, e.g. "vp09" -> "vp9"
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            # Only the first video codec counts
            if not vcodec:
                vcodec = full_codec
                if family in ('dvh1', 'dvhe'):
                    hdr = 'DV'
                elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                    hdr = 'HDR10'
                elif parts[:2] == ['vp9', '2']:
                    hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            if not acodec:
                acodec = full_codec
        elif family in SUBTITLE_CODECS:
            if not scodec:
                scodec = full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if not (vcodec or acodec or scodec):
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        return {}

    result = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if scodec is not None:
        result['scodec'] = scodec
    return result
2989
2990
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension for the given video/audio streams

    @param vcodecs, vexts  codecs and extensions of the video streams (same length)
    @param acodecs, aexts  codecs and extensions of the audio streams (same length)
    @param preferences     extensions to choose from, in order of preference;
                           if empty, any extension (including mkv) may be chosen
    @returns               the chosen extension
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    has_multiple_streams = len(acodecs) > 1 or len(vcodecs) > 1
    if allow_mkv and has_multiple_streams:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codecs):
        # Family of the first codec: the part before the first '.',
        # without zeros and lower-cased (None if it cannot be determined)
        return try_get(codecs, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())

    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    # First choice: an extension whose container supports both codecs
    for ext in preferences or COMPATIBLE_CODECS.keys():
        supported_codecs = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or supported_codecs.issuperset((vcodec, acodec)):
            return ext

    # Second choice: an extension from the same family as all stream extensions
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext}:
            return ext
        if any(family.issuperset(current_exts) for family in COMPATIBLE_EXTS):
            return ext

    if allow_mkv:
        return 'mkv'
    return preferences[-1]
3030
3031
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Work out a file extension from the headers of a response

    Sources, in order: the filename of an attachment Content-Disposition,
    the part after the last '.' of the x-amz-meta-name header, and finally
    the Content-Type (through mimetype2ext, which receives `default`).
    """
    get_header = url_handle.headers.get

    disposition = get_header('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = determine_ext(mobj.group('filename'), default_ext=None) if mobj else None
        if ext:
            return ext

    amz_name = get_header('x-amz-meta-name')
    if amz_name:
        ext = amz_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(get_header('Content-Type'), default=default)
3050
3051
def encode_data_uri(data, mime_type):
    """Return a base64 `data:` URI holding `data` (bytes-like) with the given MIME type"""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3054
3055
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked when no age limit is set (age_limit is None) or when
    the content is available for everyone (content_limit is None) """

    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3064
3065
# List of known byte-order-marks (BOM), as (BOM bytes, codec name) pairs.
# The order matters for prefix scans like the one in is_html: the UTF-32-LE
# BOM starts with the bytes of the UTF-16-LE BOM, so the UTF-32 entries
# have to be tried before the UTF-16 ones
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3074
3075
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Leading byte-order-marks are removed and select the encoding used for
    decoding (utf-8 otherwise). The result is the match object (truthy) if
    the first non-whitespace character is '<', else None. """

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        while first_bytes.startswith(bom):
            first_bytes = first_bytes[len(bom):]
            encoding = bom_encoding

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3085
3086
def determine_protocol(info_dict):
    """Return the download protocol for info_dict

    An explicit 'protocol' entry wins. Otherwise the protocol is derived from
    info_dict['url']: its rtmp/mms/rtsp prefix, its extension (m3u8, f4m),
    or else its URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        if info_dict.get('is_live'):
            return 'm3u8'
        return 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3107
3108
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned

    @param header_row  list of header cells
    @param data        list of rows
    @param delim       if truthy, a string that is repeated to build
                       a separator line below the header
    @param extra_gap   additional spaces between columns (one space is always added)
    @param hide_empty  drop the columns in which every data cell has zero width
    @returns           the table as str, one right-stripped line per row
    """
    def width(string):
        # Visible width: terminal sequences and tabs do not count
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def column_widths(rows):
        return [max(width(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, keep_flags):
        # Columns without a flag (flags exhausted) are kept
        return [cell for keep, cell in itertools.zip_longest(keep_flags, row, fillvalue=True) if keep]

    # A data column of width 0 acts as a falsy "keep" flag
    keep_flags = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, keep_flags)
    data = [keep_columns(row, keep_flags) for row in data]

    table = [header_row] + data
    max_lens = column_widths(table)
    extra_gap += 1
    if delim:
        delim_row = [delim * (col_width + extra_gap) for col_width in max_lens]
        table = [header_row, delim_row] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            padding = max_lens[pos] - width(text)
            if '\t' in text:
                # The padding goes where the tab is, pushing the rest to the right
                row[pos] = text.replace('\t', ' ' * padding) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (padding + extra_gap)
    return '\n'.join(''.join(row).rstrip() for row in table)
3139
3140
def _match_one(filter_part, dct, incomplete):
    """Evaluate a single filter expression against dct

    Two forms are understood:
      * comparison:  KEY [!]OP[?] VALUE, where OP is one of COMPARISON_OPERATORS,
                     "!" negates the operator and "?" lets a missing KEY pass
      * unary:       KEY (present / True) or !KEY (absent / False)

    @param filter_part  the expression to evaluate
    @param dct          dict holding the values that are filtered on
    @param incomplete   bool, or a container of keys that may legitimately be
                        missing from dct; conditions on such missing keys pass
    @returns            truthy if the condition holds. Not necessarily a bool:
                        e.g. "~=" gives the re.Match object
    @raises ValueError  if filter_part cannot be parsed, or if a string-only
                        operator is given a numeric comparison
    """
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    if isinstance(incomplete, bool):
        is_incomplete = lambda _: incomplete
    else:
        is_incomplete = lambda k: k in incomplete

    operator_rex = re.compile(r'''(?x)
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        # Exactly one of the two value groups took part in the match, and
        # both require at least one character
        comparison_value = m['quotedstrval'] or m['strval']
        if m['quote']:
            # Unescape the quote character inside a quoted value
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, (int, float)):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                # Not a plain integer: try a file size (with or without
                # trailing "B"), then a duration
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            return is_incomplete(m['key']) or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if is_incomplete(m.group('key')) and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
3219
3220
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.

    filter_str is a list of conditions separated by "&", all of which have to
    hold. A literal "&" inside a condition is written as a backslash followed by "&".

    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    filter_parts = re.split(r'(?<!\\)&', filter_str)
    return all(
        _match_one(part.replace(r'\&', '&'), dct, incomplete)
        for part in filter_parts)
3231
3232
def match_filter_func(filters, breaking_filters=None):
    """Build a match function out of filter strings

    @param filters           a filter string or several of them; an entry
                             passes if any of them matches. The special
                             filter '-' turns on interactive mode
    @param breaking_filters  filters of the same kind; a rejection by these
                             raises RejectedVideoReached
    @returns                 None if no filters are given at all. Otherwise
                             a function (info_dict, incomplete=False) that
                             returns None if the entry passes (NO_DEFAULT in
                             interactive mode with complete info), or else
                             a message telling why it is skipped
    """
    if not filters and not breaking_filters:
        return None

    check_breaking = match_filter_func(breaking_filters) or (lambda _, __: None)
    filter_set = set(variadic(filters or []))

    interactive = '-' in filter_set
    filter_set.discard('-')

    def _match_func(info_dict, incomplete=False):
        rejection = check_breaking(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if filter_set and not any(match_str(f, info_dict, incomplete) for f in filter_set):
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filter_set))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'

        if interactive and not incomplete:
            return NO_DEFAULT
        return None
    return _match_func
3255
3256
class download_range_func:
    """Callable that yields the sections of a video which are to be downloaded

    @param chapters   regexes; every chapter whose title matches one of them is selected
    @param ranges     (start, end) pairs of timestamps; negative values count
                      from the end of the video when its duration is known
    @param from_info  if truthy, also select the range given by the
                      start_time/end_time of the info dict
    """

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):
        """Generate the sections for info_dict

        Yields the matching chapters (as dicts with an added 'index'), then
        the ranges as {'start_time', 'end_time'} dicts. An empty dict is
        yielded when neither chapters nor ranges were requested and
        from_info does not apply. `ydl` is only used to report that no
        chapter matched.
        """

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # A negative timestamp is relative to the end of the video; it is
        # left as it is when the duration is unknown
        return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time

    def __eq__(self, other):
        # from_info changes what __call__ yields, so it takes part in the comparison
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges
                and bool(self.from_info) == bool(other.from_info))

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges}, {self.from_info})'
3297
3298
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds

    Understands a plain number with optional trailing "s", and clock values
    of the form H:MM:SS, optionally followed by "." or ":" and more digits
    (which are read as the fractional part of the seconds).

    @returns  the time in seconds (float), or None if time_expr is empty or not understood
    """
    if not time_expr:
        return None

    offset = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if not clock:
        return None
    hours, minutes, seconds = clock.groups()
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3310
3311
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode

    NOTE(review): assumes timetuple_from_msec gives
    (hours, minutes, seconds, milliseconds); confirm against its definition
    """
    time_parts = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % time_parts
3314
3315
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode, with hundredths of a second

    The last field is the milliseconds divided by 10; %d formatting drops
    its fractional part.
    """
    time_parts = timetuple_from_msec(seconds * 1000)
    centiseconds = time_parts.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*time_parts[:-1], centiseconds)
3319
3320
3321 def dfxp2srt(dfxp_data):
3322     '''
3323     @param dfxp_data A bytes-like object containing DFXP data
3324     @returns A unicode object containing converted SRT data
3325     '''
3326     LEGACY_NAMESPACES = (
3327         (b'http://www.w3.org/ns/ttml', [
3328             b'http://www.w3.org/2004/11/ttaf1',
3329             b'http://www.w3.org/2006/04/ttaf1',
3330             b'http://www.w3.org/2006/10/ttaf1',
3331         ]),
3332         (b'http://www.w3.org/ns/ttml#styling', [
3333             b'http://www.w3.org/ns/ttml#style',
3334         ]),
3335     )
3336
3337     SUPPORTED_STYLING = [
3338         'color',
3339         'fontFamily',
3340         'fontSize',
3341         'fontStyle',
3342         'fontWeight',
3343         'textDecoration'
3344     ]
3345
3346     _x = functools.partial(xpath_with_ns, ns_map={
3347         'xml': 'http://www.w3.org/XML/1998/namespace',
3348         'ttml': 'http://www.w3.org/ns/ttml',
3349         'tts': 'http://www.w3.org/ns/ttml#styling',
3350     })
3351
3352     styles = {}
3353     default_style = {}
3354
3355     class TTMLPElementParser:
3356         _out = ''
3357         _unclosed_elements = []
3358         _applied_styles = []
3359
3360         def start(self, tag, attrib):
3361             if tag in (_x('ttml:br'), 'br'):
3362                 self._out += '\n'
3363             else:
3364                 unclosed_elements = []
3365                 style = {}
3366                 element_style_id = attrib.get('style')
3367                 if default_style:
3368                     style.update(default_style)
3369                 if element_style_id:
3370                     style.update(styles.get(element_style_id, {}))
3371                 for prop in SUPPORTED_STYLING:
3372                     prop_val = attrib.get(_x('tts:' + prop))
3373                     if prop_val:
3374                         style[prop] = prop_val
3375                 if style:
3376                     font = ''
3377                     for k, v in sorted(style.items()):
3378                         if self._applied_styles and self._applied_styles[-1].get(k) == v:
3379                             continue
3380                         if k == 'color':
3381                             font += ' color="%s"' % v
3382                         elif k == 'fontSize':
3383                             font += ' size="%s"' % v
3384                         elif k == 'fontFamily':
3385                             font += ' face="%s"' % v
3386                         elif k == 'fontWeight' and v == 'bold':
3387                             self._out += '<b>'
3388                             unclosed_elements.append('b')
3389                         elif k == 'fontStyle' and v == 'italic':
3390                             self._out += '<i>'
3391                             unclosed_elements.append('i')
3392                         elif k == 'textDecoration' and v == 'underline':
3393                             self._out += '<u>'
3394                             unclosed_elements.append('u')
3395                     if font:
3396                         self._out += '<font' + font + '>'
3397                         unclosed_elements.append('font')
3398                     applied_style = {}
3399                     if self._applied_styles:
3400                         applied_style.update(self._applied_styles[-1])
3401                     applied_style.update(style)
3402                     self._applied_styles.append(applied_style)
3403                 self._unclosed_elements.append(unclosed_elements)
3404
3405         def end(self, tag):
3406             if tag not in (_x('ttml:br'), 'br'):
3407                 unclosed_elements = self._unclosed_elements.pop()
3408                 for element in reversed(unclosed_elements):
3409                     self._out += '</%s>' % element
3410                 if unclosed_elements and self._applied_styles:
3411                     self._applied_styles.pop()
3412
3413         def data(self, data):
3414             self._out += data
3415
3416         def close(self):
3417             return self._out.strip()
3418
3419     # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
3420     # This will not trigger false positives since only UTF-8 text is being replaced
3421     dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
3422
3423     def parse_node(node):
3424         target = TTMLPElementParser()
3425         parser = xml.etree.ElementTree.XMLParser(target=target)
3426         parser.feed(xml.etree.ElementTree.tostring(node))
3427         return parser.close()
3428
3429     for k, v in LEGACY_NAMESPACES:
3430         for ns in v:
3431             dfxp_data = dfxp_data.replace(ns, k)
3432
3433     dfxp = compat_etree_fromstring(dfxp_data)
3434     out = []
3435     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3436
3437     if not paras:
3438         raise ValueError('Invalid dfxp/TTML subtitle')
3439
3440     repeat = False
3441     while True:
3442         for style in dfxp.findall(_x('.//ttml:style')):
3443             style_id = style.get('id') or style.get(_x('xml:id'))
3444             if not style_id:
3445                 continue
3446             parent_style_id = style.get('style')
3447             if parent_style_id:
3448                 if parent_style_id not in styles:
3449                     repeat = True
3450                     continue
3451                 styles[style_id] = styles[parent_style_id].copy()
3452             for prop in SUPPORTED_STYLING:
3453                 prop_val = style.get(_x('tts:' + prop))
3454                 if prop_val:
3455                     styles.setdefault(style_id, {})[prop] = prop_val
3456         if repeat:
3457             repeat = False
3458         else:
3459             break
3460
3461     for p in ('body', 'div'):
3462         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3463         if ele is None:
3464             continue
3465         style = styles.get(ele.get('style'))
3466         if not style:
3467             continue
3468         default_style.update(style)
3469
3470     for para, index in zip(paras, itertools.count(1)):
3471         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3472         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3473         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3474         if begin_time is None:
3475             continue
3476         if not end_time:
3477             if not dur:
3478                 continue
3479             end_time = begin_time + dur
3480         out.append('%d\n%s --> %s\n%s\n\n' % (
3481             index,
3482             srt_subtitles_timecode(begin_time),
3483             srt_subtitles_timecode(end_time),
3484             parse_node(para)))
3485
3486     return ''.join(out)
3487
3488
def cli_option(params, command_option, param, separator=None):
    """Build the argument list for a single command-line option.

    @param params          mapping the option value is looked up in
    @param command_option  the switch to emit, e.g. '--proxy'
    @param param           key of the value inside params
    @param separator       if given, switch and value are joined with it into
                           one argument instead of being emitted separately
    @returns               [] when the value is missing or None, otherwise a
                           list with one or two strings
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3494
3495
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the argument list for a boolean command-line option.

    The value stored under `param` must be True, False or None (asserted).
    True/False are rendered as `true_value`/`false_value` through cli_option();
    None (or a missing key) produces an empty list.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered = {True: true_value, False: false_value}
    return cli_option(rendered, command_option, flag, separator)
3500
3501
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare switch when the stored value equals `expected_value`.

    @returns  [command_option] if params.get(param) == expected_value,
              otherwise []
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
3504
3505
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick the extra arguments configured for the first matching key.

    @param argdict     dict mapping lowercase keys to argument lists.  A plain
                       list/tuple is still accepted for backward compatibility
                       and returned unchanged when `use_compat` is set,
                       otherwise it is treated like None
    @param keys        list/tuple of lookup entries, tried in order.  Each
                       entry is passed through variadic(), so it can be a
                       single key or a group of keys
    @param default     returned as-is (not copied) when nothing matches
    @param use_compat  whether a list/tuple `argdict` is honoured
    @returns           the concatenated argument lists of the first entry that
                       has at least one non-None value in argdict
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_group in keys:
        candidates = [argdict.get(key.lower()) for key in variadic(key_group)]
        found = [value for value in candidates if value is not None]
        if found:
            return [arg for group in found for arg in group]
    return default
3524
3525
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve the configured arguments for `exe` used on behalf of `main_key`.

    The root key is `exe` itself when it equals `main_key` (both compared in
    lowercase), otherwise '<main_key>+<exe>'.  Every entry of `keys` is used
    as a suffix of the root key.  If the bare root key is among the resulting
    keys, the (main_key, exe) pair and finally 'default' are added as
    fallbacks; if it is not, the list/tuple compatibility mode of
    cli_configuration_args() is switched off.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    same_name = main_key == exe
    root_key = exe if same_name else f'{main_key}+{exe}'
    lookup = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in lookup:
        use_compat = False
    else:
        if not same_name:
            lookup.append((main_key, exe))
        lookup.append('default')
    return cli_configuration_args(argdict, lookup, default, use_compat)
3537
3538
3539 class ISO639Utils:
3540     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3541     _lang_map = {
3542         'aa': 'aar',
3543         'ab': 'abk',
3544         'ae': 'ave',
3545         'af': 'afr',
3546         'ak': 'aka',
3547         'am': 'amh',
3548         'an': 'arg',
3549         'ar': 'ara',
3550         'as': 'asm',
3551         'av': 'ava',
3552         'ay': 'aym',
3553         'az': 'aze',
3554         'ba': 'bak',
3555         'be': 'bel',
3556         'bg': 'bul',
3557         'bh': 'bih',
3558         'bi': 'bis',
3559         'bm': 'bam',
3560         'bn': 'ben',
3561         'bo': 'bod',
3562         'br': 'bre',
3563         'bs': 'bos',
3564         'ca': 'cat',
3565         'ce': 'che',
3566         'ch': 'cha',
3567         'co': 'cos',
3568         'cr': 'cre',
3569         'cs': 'ces',
3570         'cu': 'chu',
3571         'cv': 'chv',
3572         'cy': 'cym',
3573         'da': 'dan',
3574         'de': 'deu',
3575         'dv': 'div',
3576         'dz': 'dzo',
3577         'ee': 'ewe',
3578         'el': 'ell',
3579         'en': 'eng',
3580         'eo': 'epo',
3581         'es': 'spa',
3582         'et': 'est',
3583         'eu': 'eus',
3584         'fa': 'fas',
3585         'ff': 'ful',
3586         'fi': 'fin',
3587         'fj': 'fij',
3588         'fo': 'fao',
3589         'fr': 'fra',
3590         'fy': 'fry',
3591         'ga': 'gle',
3592         'gd': 'gla',
3593         'gl': 'glg',
3594         'gn': 'grn',
3595         'gu': 'guj',
3596         'gv': 'glv',
3597         'ha': 'hau',
3598         'he': 'heb',
3599         'iw': 'heb',  # Replaced by he in 1989 revision
3600         'hi': 'hin',
3601         'ho': 'hmo',
3602         'hr': 'hrv',
3603         'ht': 'hat',
3604         'hu': 'hun',
3605         'hy': 'hye',
3606         'hz': 'her',
3607         'ia': 'ina',
3608         'id': 'ind',
3609         'in': 'ind',  # Replaced by id in 1989 revision
3610         'ie': 'ile',
3611         'ig': 'ibo',
3612         'ii': 'iii',
3613         'ik': 'ipk',
3614         'io': 'ido',
3615         'is': 'isl',
3616         'it': 'ita',
3617         'iu': 'iku',
3618         'ja': 'jpn',
3619         'jv': 'jav',
3620         'ka': 'kat',
3621         'kg': 'kon',
3622         'ki': 'kik',
3623         'kj': 'kua',
3624         'kk': 'kaz',
3625         'kl': 'kal',
3626         'km': 'khm',
3627         'kn': 'kan',
3628         'ko': 'kor',
3629         'kr': 'kau',
3630         'ks': 'kas',
3631         'ku': 'kur',
3632         'kv': 'kom',
3633         'kw': 'cor',
3634         'ky': 'kir',
3635         'la': 'lat',
3636         'lb': 'ltz',
3637         'lg': 'lug',
3638         'li': 'lim',
3639         'ln': 'lin',
3640         'lo': 'lao',
3641         'lt': 'lit',
3642         'lu': 'lub',
3643         'lv': 'lav',
3644         'mg': 'mlg',
3645         'mh': 'mah',
3646         'mi': 'mri',
3647         'mk': 'mkd',
3648         'ml': 'mal',
3649         'mn': 'mon',
3650         'mr': 'mar',
3651         'ms': 'msa',
3652         'mt': 'mlt',
3653         'my': 'mya',
3654         'na': 'nau',
3655         'nb': 'nob',
3656         'nd': 'nde',
3657         'ne': 'nep',
3658         'ng': 'ndo',
3659         'nl': 'nld',
3660         'nn': 'nno',
3661         'no': 'nor',
3662         'nr': 'nbl',
3663         'nv': 'nav',
3664         'ny': 'nya',
3665         'oc': 'oci',
3666         'oj': 'oji',
3667         'om': 'orm',
3668         'or': 'ori',
3669         'os': 'oss',
3670         'pa': 'pan',
3671         'pe': 'per',
3672         'pi': 'pli',
3673         'pl': 'pol',
3674         'ps': 'pus',
3675         'pt': 'por',
3676         'qu': 'que',
3677         'rm': 'roh',
3678         'rn': 'run',
3679         'ro': 'ron',
3680         'ru': 'rus',
3681         'rw': 'kin',
3682         'sa': 'san',
3683         'sc': 'srd',
3684         'sd': 'snd',
3685         'se': 'sme',
3686         'sg': 'sag',
3687         'si': 'sin',
3688         'sk': 'slk',
3689         'sl': 'slv',
3690         'sm': 'smo',
3691         'sn': 'sna',
3692         'so': 'som',
3693         'sq': 'sqi',
3694         'sr': 'srp',
3695         'ss': 'ssw',
3696         'st': 'sot',
3697         'su': 'sun',
3698         'sv': 'swe',
3699         'sw': 'swa',
3700         'ta': 'tam',
3701         'te': 'tel',
3702         'tg': 'tgk',
3703         'th': 'tha',
3704         'ti': 'tir',
3705         'tk': 'tuk',
3706         'tl': 'tgl',
3707         'tn': 'tsn',
3708         'to': 'ton',
3709         'tr': 'tur',
3710         'ts': 'tso',
3711         'tt': 'tat',
3712         'tw': 'twi',
3713         'ty': 'tah',
3714         'ug': 'uig',
3715         'uk': 'ukr',
3716         'ur': 'urd',
3717         'uz': 'uzb',
3718         've': 'ven',
3719         'vi': 'vie',
3720         'vo': 'vol',
3721         'wa': 'wln',
3722         'wo': 'wol',
3723         'xh': 'xho',
3724         'yi': 'yid',
3725         'ji': 'yid',  # Replaced by yi in 1989 revision
3726         'yo': 'yor',
3727         'za': 'zha',
3728         'zh': 'zho',
3729         'zu': 'zul',
3730     }
3731
3732     @classmethod
3733     def short2long(cls, code):
3734         """Convert language code from ISO 639-1 to ISO 639-2/T"""
3735         return cls._lang_map.get(code[:2])
3736
3737     @classmethod
3738     def long2short(cls, code):
3739         """Convert language code from ISO 639-2/T to ISO 639-1"""
3740         for short_name, long_name in cls._lang_map.items():
3741             if long_name == code:
3742                 return short_name
3743
3744
3745 class ISO3166Utils:
3746     # From http://data.okfn.org/data/core/country-list
3747     _country_map = {
3748         'AF': 'Afghanistan',
3749         'AX': 'Åland Islands',
3750         'AL': 'Albania',
3751         'DZ': 'Algeria',
3752         'AS': 'American Samoa',
3753         'AD': 'Andorra',
3754         'AO': 'Angola',
3755         'AI': 'Anguilla',
3756         'AQ': 'Antarctica',
3757         'AG': 'Antigua and Barbuda',
3758         'AR': 'Argentina',
3759         'AM': 'Armenia',
3760         'AW': 'Aruba',
3761         'AU': 'Australia',
3762         'AT': 'Austria',
3763         'AZ': 'Azerbaijan',
3764         'BS': 'Bahamas',
3765         'BH': 'Bahrain',
3766         'BD': 'Bangladesh',
3767         'BB': 'Barbados',
3768         'BY': 'Belarus',
3769         'BE': 'Belgium',
3770         'BZ': 'Belize',
3771         'BJ': 'Benin',
3772         'BM': 'Bermuda',
3773         'BT': 'Bhutan',
3774         'BO': 'Bolivia, Plurinational State of',
3775         'BQ': 'Bonaire, Sint Eustatius and Saba',
3776         'BA': 'Bosnia and Herzegovina',
3777         'BW': 'Botswana',
3778         'BV': 'Bouvet Island',
3779         'BR': 'Brazil',
3780         'IO': 'British Indian Ocean Territory',
3781         'BN': 'Brunei Darussalam',
3782         'BG': 'Bulgaria',
3783         'BF': 'Burkina Faso',
3784         'BI': 'Burundi',
3785         'KH': 'Cambodia',
3786         'CM': 'Cameroon',
3787         'CA': 'Canada',
3788         'CV': 'Cape Verde',
3789         'KY': 'Cayman Islands',
3790         'CF': 'Central African Republic',
3791         'TD': 'Chad',
3792         'CL': 'Chile',
3793         'CN': 'China',
3794         'CX': 'Christmas Island',
3795         'CC': 'Cocos (Keeling) Islands',
3796         'CO': 'Colombia',
3797         'KM': 'Comoros',
3798         'CG': 'Congo',
3799         'CD': 'Congo, the Democratic Republic of the',
3800         'CK': 'Cook Islands',
3801         'CR': 'Costa Rica',
3802         'CI': 'Côte d\'Ivoire',
3803         'HR': 'Croatia',
3804         'CU': 'Cuba',
3805         'CW': 'Curaçao',
3806         'CY': 'Cyprus',
3807         'CZ': 'Czech Republic',
3808         'DK': 'Denmark',
3809         'DJ': 'Djibouti',
3810         'DM': 'Dominica',
3811         'DO': 'Dominican Republic',
3812         'EC': 'Ecuador',
3813         'EG': 'Egypt',
3814         'SV': 'El Salvador',
3815         'GQ': 'Equatorial Guinea',
3816         'ER': 'Eritrea',
3817         'EE': 'Estonia',
3818         'ET': 'Ethiopia',
3819         'FK': 'Falkland Islands (Malvinas)',
3820         'FO': 'Faroe Islands',
3821         'FJ': 'Fiji',
3822         'FI': 'Finland',
3823         'FR': 'France',
3824         'GF': 'French Guiana',
3825         'PF': 'French Polynesia',
3826         'TF': 'French Southern Territories',
3827         'GA': 'Gabon',
3828         'GM': 'Gambia',
3829         'GE': 'Georgia',
3830         'DE': 'Germany',
3831         'GH': 'Ghana',
3832         'GI': 'Gibraltar',
3833         'GR': 'Greece',
3834         'GL': 'Greenland',
3835         'GD': 'Grenada',
3836         'GP': 'Guadeloupe',
3837         'GU': 'Guam',
3838         'GT': 'Guatemala',
3839         'GG': 'Guernsey',
3840         'GN': 'Guinea',
3841         'GW': 'Guinea-Bissau',
3842         'GY': 'Guyana',
3843         'HT': 'Haiti',
3844         'HM': 'Heard Island and McDonald Islands',
3845         'VA': 'Holy See (Vatican City State)',
3846         'HN': 'Honduras',
3847         'HK': 'Hong Kong',
3848         'HU': 'Hungary',
3849         'IS': 'Iceland',
3850         'IN': 'India',
3851         'ID': 'Indonesia',
3852         'IR': 'Iran, Islamic Republic of',
3853         'IQ': 'Iraq',
3854         'IE': 'Ireland',
3855         'IM': 'Isle of Man',
3856         'IL': 'Israel',
3857         'IT': 'Italy',
3858         'JM': 'Jamaica',
3859         'JP': 'Japan',
3860         'JE': 'Jersey',
3861         'JO': 'Jordan',
3862         'KZ': 'Kazakhstan',
3863         'KE': 'Kenya',
3864         'KI': 'Kiribati',
3865         'KP': 'Korea, Democratic People\'s Republic of',
3866         'KR': 'Korea, Republic of',
3867         'KW': 'Kuwait',
3868         'KG': 'Kyrgyzstan',
3869         'LA': 'Lao People\'s Democratic Republic',
3870         'LV': 'Latvia',
3871         'LB': 'Lebanon',
3872         'LS': 'Lesotho',
3873         'LR': 'Liberia',
3874         'LY': 'Libya',
3875         'LI': 'Liechtenstein',
3876         'LT': 'Lithuania',
3877         'LU': 'Luxembourg',
3878         'MO': 'Macao',
3879         'MK': 'Macedonia, the Former Yugoslav Republic of',
3880         'MG': 'Madagascar',
3881         'MW': 'Malawi',
3882         'MY': 'Malaysia',
3883         'MV': 'Maldives',
3884         'ML': 'Mali',
3885         'MT': 'Malta',
3886         'MH': 'Marshall Islands',
3887         'MQ': 'Martinique',
3888         'MR': 'Mauritania',
3889         'MU': 'Mauritius',
3890         'YT': 'Mayotte',
3891         'MX': 'Mexico',
3892         'FM': 'Micronesia, Federated States of',
3893         'MD': 'Moldova, Republic of',
3894         'MC': 'Monaco',
3895         'MN': 'Mongolia',
3896         'ME': 'Montenegro',
3897         'MS': 'Montserrat',
3898         'MA': 'Morocco',
3899         'MZ': 'Mozambique',
3900         'MM': 'Myanmar',
3901         'NA': 'Namibia',
3902         'NR': 'Nauru',
3903         'NP': 'Nepal',
3904         'NL': 'Netherlands',
3905         'NC': 'New Caledonia',
3906         'NZ': 'New Zealand',
3907         'NI': 'Nicaragua',
3908         'NE': 'Niger',
3909         'NG': 'Nigeria',
3910         'NU': 'Niue',
3911         'NF': 'Norfolk Island',
3912         'MP': 'Northern Mariana Islands',
3913         'NO': 'Norway',
3914         'OM': 'Oman',
3915         'PK': 'Pakistan',
3916         'PW': 'Palau',
3917         'PS': 'Palestine, State of',
3918         'PA': 'Panama',
3919         'PG': 'Papua New Guinea',
3920         'PY': 'Paraguay',
3921         'PE': 'Peru',
3922         'PH': 'Philippines',
3923         'PN': 'Pitcairn',
3924         'PL': 'Poland',
3925         'PT': 'Portugal',
3926         'PR': 'Puerto Rico',
3927         'QA': 'Qatar',
3928         'RE': 'Réunion',
3929         'RO': 'Romania',
3930         'RU': 'Russian Federation',
3931         'RW': 'Rwanda',
3932         'BL': 'Saint Barthélemy',
3933         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3934         'KN': 'Saint Kitts and Nevis',
3935         'LC': 'Saint Lucia',
3936         'MF': 'Saint Martin (French part)',
3937         'PM': 'Saint Pierre and Miquelon',
3938         'VC': 'Saint Vincent and the Grenadines',
3939         'WS': 'Samoa',
3940         'SM': 'San Marino',
3941         'ST': 'Sao Tome and Principe',
3942         'SA': 'Saudi Arabia',
3943         'SN': 'Senegal',
3944         'RS': 'Serbia',
3945         'SC': 'Seychelles',
3946         'SL': 'Sierra Leone',
3947         'SG': 'Singapore',
3948         'SX': 'Sint Maarten (Dutch part)',
3949         'SK': 'Slovakia',
3950         'SI': 'Slovenia',
3951         'SB': 'Solomon Islands',
3952         'SO': 'Somalia',
3953         'ZA': 'South Africa',
3954         'GS': 'South Georgia and the South Sandwich Islands',
3955         'SS': 'South Sudan',
3956         'ES': 'Spain',
3957         'LK': 'Sri Lanka',
3958         'SD': 'Sudan',
3959         'SR': 'Suriname',
3960         'SJ': 'Svalbard and Jan Mayen',
3961         'SZ': 'Swaziland',
3962         'SE': 'Sweden',
3963         'CH': 'Switzerland',
3964         'SY': 'Syrian Arab Republic',
3965         'TW': 'Taiwan, Province of China',
3966         'TJ': 'Tajikistan',
3967         'TZ': 'Tanzania, United Republic of',
3968         'TH': 'Thailand',
3969         'TL': 'Timor-Leste',
3970         'TG': 'Togo',
3971         'TK': 'Tokelau',
3972         'TO': 'Tonga',
3973         'TT': 'Trinidad and Tobago',
3974         'TN': 'Tunisia',
3975         'TR': 'Turkey',
3976         'TM': 'Turkmenistan',
3977         'TC': 'Turks and Caicos Islands',
3978         'TV': 'Tuvalu',
3979         'UG': 'Uganda',
3980         'UA': 'Ukraine',
3981         'AE': 'United Arab Emirates',
3982         'GB': 'United Kingdom',
3983         'US': 'United States',
3984         'UM': 'United States Minor Outlying Islands',
3985         'UY': 'Uruguay',
3986         'UZ': 'Uzbekistan',
3987         'VU': 'Vanuatu',
3988         'VE': 'Venezuela, Bolivarian Republic of',
3989         'VN': 'Viet Nam',
3990         'VG': 'Virgin Islands, British',
3991         'VI': 'Virgin Islands, U.S.',
3992         'WF': 'Wallis and Futuna',
3993         'EH': 'Western Sahara',
3994         'YE': 'Yemen',
3995         'ZM': 'Zambia',
3996         'ZW': 'Zimbabwe',
3997         # Not ISO 3166 codes, but used for IP blocks
3998         'AP': 'Asia/Pacific Region',
3999         'EU': 'Europe',
4000     }
4001
4002     @classmethod
4003     def short2full(cls, code):
4004         """Convert an ISO 3166-2 country code to the corresponding full name"""
4005         return cls._country_map.get(code.upper())
4006
4007
4008 class GeoUtils:
4009     # Major IPv4 address blocks per country
4010     _country_ip_map = {
4011         'AD': '46.172.224.0/19',
4012         'AE': '94.200.0.0/13',
4013         'AF': '149.54.0.0/17',
4014         'AG': '209.59.64.0/18',
4015         'AI': '204.14.248.0/21',
4016         'AL': '46.99.0.0/16',
4017         'AM': '46.70.0.0/15',
4018         'AO': '105.168.0.0/13',
4019         'AP': '182.50.184.0/21',
4020         'AQ': '23.154.160.0/24',
4021         'AR': '181.0.0.0/12',
4022         'AS': '202.70.112.0/20',
4023         'AT': '77.116.0.0/14',
4024         'AU': '1.128.0.0/11',
4025         'AW': '181.41.0.0/18',
4026         'AX': '185.217.4.0/22',
4027         'AZ': '5.197.0.0/16',
4028         'BA': '31.176.128.0/17',
4029         'BB': '65.48.128.0/17',
4030         'BD': '114.130.0.0/16',
4031         'BE': '57.0.0.0/8',
4032         'BF': '102.178.0.0/15',
4033         'BG': '95.42.0.0/15',
4034         'BH': '37.131.0.0/17',
4035         'BI': '154.117.192.0/18',
4036         'BJ': '137.255.0.0/16',
4037         'BL': '185.212.72.0/23',
4038         'BM': '196.12.64.0/18',
4039         'BN': '156.31.0.0/16',
4040         'BO': '161.56.0.0/16',
4041         'BQ': '161.0.80.0/20',
4042         'BR': '191.128.0.0/12',
4043         'BS': '24.51.64.0/18',
4044         'BT': '119.2.96.0/19',
4045         'BW': '168.167.0.0/16',
4046         'BY': '178.120.0.0/13',
4047         'BZ': '179.42.192.0/18',
4048         'CA': '99.224.0.0/11',
4049         'CD': '41.243.0.0/16',
4050         'CF': '197.242.176.0/21',
4051         'CG': '160.113.0.0/16',
4052         'CH': '85.0.0.0/13',
4053         'CI': '102.136.0.0/14',
4054         'CK': '202.65.32.0/19',
4055         'CL': '152.172.0.0/14',
4056         'CM': '102.244.0.0/14',
4057         'CN': '36.128.0.0/10',
4058         'CO': '181.240.0.0/12',
4059         'CR': '201.192.0.0/12',
4060         'CU': '152.206.0.0/15',
4061         'CV': '165.90.96.0/19',
4062         'CW': '190.88.128.0/17',
4063         'CY': '31.153.0.0/16',
4064         'CZ': '88.100.0.0/14',
4065         'DE': '53.0.0.0/8',
4066         'DJ': '197.241.0.0/17',
4067         'DK': '87.48.0.0/12',
4068         'DM': '192.243.48.0/20',
4069         'DO': '152.166.0.0/15',
4070         'DZ': '41.96.0.0/12',
4071         'EC': '186.68.0.0/15',
4072         'EE': '90.190.0.0/15',
4073         'EG': '156.160.0.0/11',
4074         'ER': '196.200.96.0/20',
4075         'ES': '88.0.0.0/11',
4076         'ET': '196.188.0.0/14',
4077         'EU': '2.16.0.0/13',
4078         'FI': '91.152.0.0/13',
4079         'FJ': '144.120.0.0/16',
4080         'FK': '80.73.208.0/21',
4081         'FM': '119.252.112.0/20',
4082         'FO': '88.85.32.0/19',
4083         'FR': '90.0.0.0/9',
4084         'GA': '41.158.0.0/15',
4085         'GB': '25.0.0.0/8',
4086         'GD': '74.122.88.0/21',
4087         'GE': '31.146.0.0/16',
4088         'GF': '161.22.64.0/18',
4089         'GG': '62.68.160.0/19',
4090         'GH': '154.160.0.0/12',
4091         'GI': '95.164.0.0/16',
4092         'GL': '88.83.0.0/19',
4093         'GM': '160.182.0.0/15',
4094         'GN': '197.149.192.0/18',
4095         'GP': '104.250.0.0/19',
4096         'GQ': '105.235.224.0/20',
4097         'GR': '94.64.0.0/13',
4098         'GT': '168.234.0.0/16',
4099         'GU': '168.123.0.0/16',
4100         'GW': '197.214.80.0/20',
4101         'GY': '181.41.64.0/18',
4102         'HK': '113.252.0.0/14',
4103         'HN': '181.210.0.0/16',
4104         'HR': '93.136.0.0/13',
4105         'HT': '148.102.128.0/17',
4106         'HU': '84.0.0.0/14',
4107         'ID': '39.192.0.0/10',
4108         'IE': '87.32.0.0/12',
4109         'IL': '79.176.0.0/13',
4110         'IM': '5.62.80.0/20',
4111         'IN': '117.192.0.0/10',
4112         'IO': '203.83.48.0/21',
4113         'IQ': '37.236.0.0/14',
4114         'IR': '2.176.0.0/12',
4115         'IS': '82.221.0.0/16',
4116         'IT': '79.0.0.0/10',
4117         'JE': '87.244.64.0/18',
4118         'JM': '72.27.0.0/17',
4119         'JO': '176.29.0.0/16',
4120         'JP': '133.0.0.0/8',
4121         'KE': '105.48.0.0/12',
4122         'KG': '158.181.128.0/17',
4123         'KH': '36.37.128.0/17',
4124         'KI': '103.25.140.0/22',
4125         'KM': '197.255.224.0/20',
4126         'KN': '198.167.192.0/19',
4127         'KP': '175.45.176.0/22',
4128         'KR': '175.192.0.0/10',
4129         'KW': '37.36.0.0/14',
4130         'KY': '64.96.0.0/15',
4131         'KZ': '2.72.0.0/13',
4132         'LA': '115.84.64.0/18',
4133         'LB': '178.135.0.0/16',
4134         'LC': '24.92.144.0/20',
4135         'LI': '82.117.0.0/19',
4136         'LK': '112.134.0.0/15',
4137         'LR': '102.183.0.0/16',
4138         'LS': '129.232.0.0/17',
4139         'LT': '78.56.0.0/13',
4140         'LU': '188.42.0.0/16',
4141         'LV': '46.109.0.0/16',
4142         'LY': '41.252.0.0/14',
4143         'MA': '105.128.0.0/11',
4144         'MC': '88.209.64.0/18',
4145         'MD': '37.246.0.0/16',
4146         'ME': '178.175.0.0/17',
4147         'MF': '74.112.232.0/21',
4148         'MG': '154.126.0.0/17',
4149         'MH': '117.103.88.0/21',
4150         'MK': '77.28.0.0/15',
4151         'ML': '154.118.128.0/18',
4152         'MM': '37.111.0.0/17',
4153         'MN': '49.0.128.0/17',
4154         'MO': '60.246.0.0/16',
4155         'MP': '202.88.64.0/20',
4156         'MQ': '109.203.224.0/19',
4157         'MR': '41.188.64.0/18',
4158         'MS': '208.90.112.0/22',
4159         'MT': '46.11.0.0/16',
4160         'MU': '105.16.0.0/12',
4161         'MV': '27.114.128.0/18',
4162         'MW': '102.70.0.0/15',
4163         'MX': '187.192.0.0/11',
4164         'MY': '175.136.0.0/13',
4165         'MZ': '197.218.0.0/15',
4166         'NA': '41.182.0.0/16',
4167         'NC': '101.101.0.0/18',
4168         'NE': '197.214.0.0/18',
4169         'NF': '203.17.240.0/22',
4170         'NG': '105.112.0.0/12',
4171         'NI': '186.76.0.0/15',
4172         'NL': '145.96.0.0/11',
4173         'NO': '84.208.0.0/13',
4174         'NP': '36.252.0.0/15',
4175         'NR': '203.98.224.0/19',
4176         'NU': '49.156.48.0/22',
4177         'NZ': '49.224.0.0/14',
4178         'OM': '5.36.0.0/15',
4179         'PA': '186.72.0.0/15',
4180         'PE': '186.160.0.0/14',
4181         'PF': '123.50.64.0/18',
4182         'PG': '124.240.192.0/19',
4183         'PH': '49.144.0.0/13',
4184         'PK': '39.32.0.0/11',
4185         'PL': '83.0.0.0/11',
4186         'PM': '70.36.0.0/20',
4187         'PR': '66.50.0.0/16',
4188         'PS': '188.161.0.0/16',
4189         'PT': '85.240.0.0/13',
4190         'PW': '202.124.224.0/20',
4191         'PY': '181.120.0.0/14',
4192         'QA': '37.210.0.0/15',
4193         'RE': '102.35.0.0/16',
4194         'RO': '79.112.0.0/13',
4195         'RS': '93.86.0.0/15',
4196         'RU': '5.136.0.0/13',
4197         'RW': '41.186.0.0/16',
4198         'SA': '188.48.0.0/13',
4199         'SB': '202.1.160.0/19',
4200         'SC': '154.192.0.0/11',
4201         'SD': '102.120.0.0/13',
4202         'SE': '78.64.0.0/12',
4203         'SG': '8.128.0.0/10',
4204         'SI': '188.196.0.0/14',
4205         'SK': '78.98.0.0/15',
4206         'SL': '102.143.0.0/17',
4207         'SM': '89.186.32.0/19',
4208         'SN': '41.82.0.0/15',
4209         'SO': '154.115.192.0/18',
4210         'SR': '186.179.128.0/17',
4211         'SS': '105.235.208.0/21',
4212         'ST': '197.159.160.0/19',
4213         'SV': '168.243.0.0/16',
4214         'SX': '190.102.0.0/20',
4215         'SY': '5.0.0.0/16',
4216         'SZ': '41.84.224.0/19',
4217         'TC': '65.255.48.0/20',
4218         'TD': '154.68.128.0/19',
4219         'TG': '196.168.0.0/14',
4220         'TH': '171.96.0.0/13',
4221         'TJ': '85.9.128.0/18',
4222         'TK': '27.96.24.0/21',
4223         'TL': '180.189.160.0/20',
4224         'TM': '95.85.96.0/19',
4225         'TN': '197.0.0.0/11',
4226         'TO': '175.176.144.0/21',
4227         'TR': '78.160.0.0/11',
4228         'TT': '186.44.0.0/15',
4229         'TV': '202.2.96.0/19',
4230         'TW': '120.96.0.0/11',
4231         'TZ': '156.156.0.0/14',
4232         'UA': '37.52.0.0/14',
4233         'UG': '102.80.0.0/13',
4234         'US': '6.0.0.0/8',
4235         'UY': '167.56.0.0/13',
4236         'UZ': '84.54.64.0/18',
4237         'VA': '212.77.0.0/19',
4238         'VC': '207.191.240.0/21',
4239         'VE': '186.88.0.0/13',
4240         'VG': '66.81.192.0/20',
4241         'VI': '146.226.0.0/16',
4242         'VN': '14.160.0.0/11',
4243         'VU': '202.80.32.0/20',
4244         'WF': '117.20.32.0/21',
4245         'WS': '202.4.32.0/19',
4246         'YE': '134.35.0.0/16',
4247         'YT': '41.242.116.0/22',
4248         'ZA': '41.0.0.0/11',
4249         'ZM': '102.144.0.0/13',
4250         'ZW': '102.177.192.0/18',
4251     }
4252
4253     @classmethod
4254     def random_ipv4(cls, code_or_block):
4255         if len(code_or_block) == 2:
4256             block = cls._country_ip_map.get(code_or_block.upper())
4257             if not block:
4258                 return None
4259         else:
4260             block = code_or_block
4261         addr, preflen = block.split('/')
4262         addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4263         addr_max = addr_min | (0xffffffff >> int(preflen))
4264         return str(socket.inet_ntoa(
4265             struct.pack('!L', random.randint(addr_min, addr_max))))
4266
4267
4268 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4269 # released into Public Domain
4270 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4271
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    `n` is passed through int() first.  A positive value is encoded without
    leading zero bytes; zero (and any value that is not positive) is encoded
    as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # Minimal big-endian encoding: just enough bytes to hold every bit
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4300
4301
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    The bytes are interpreted as an unsigned big-endian number; an empty
    string gives 0.

    This is (essentially) the inverse of long_to_bytes().
    """
    return int.from_bytes(s, 'big')
4317
4318
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The byte order is reversed before the data is read as one hex number
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
4334
4335
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout `00 02 <padding> 00 <data>`, where the padding
    is made of random, nonzero bytes (at least 8 of them).

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data does not leave room for the 11 bytes of overhead
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding bytes must be nonzero: the first zero byte after the `00 02`
    # header marks the end of the padding, so a zero inside the padding would
    # make the receiver cut the message at the wrong place
    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2, *padding, 0, *data]
4349
4350
4351 def _base_n_table(n, table):
4352     if not table and not n:
4353         raise ValueError('Either table or n must be specified')
4354     table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4355
4356     if n and n != len(table):
4357         raise ValueError(f'base {n} exceeds table length {len(table)}')
4358     return table
4359
4360
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    digits = _base_n_table(n, table)
    if not num:
        return digits[0]

    radix = len(digits)
    # Digits come out least significant first, so they are reversed at the end
    chunks = []
    while num:
        num, remainder = divmod(num, radix)
        chunks.append(digits[remainder])
    return ''.join(reversed(chunks))
4372
4373
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    digit_values = {}
    for position, digit in enumerate(_base_n_table(n, table)):
        digit_values[digit] = position

    radix = len(digit_values)
    value = 0
    for digit in string:
        # A character outside the alphabet raises KeyError here
        value = value * radix + digit_values[digit]
    return value
4381
4382
def decode_packed_codes(code):
    """
    Expand code that was packed in the form matched by PACKED_CODES_RE

    The match provides the obfuscated code, a base, a symbol count and a
    `|`-separated symbol list. Every word of the obfuscated code is the
    base-n encoding of an index into that list and is replaced by the
    symbol at that index (or kept as is when the symbol is empty).
    """
    packed, radix, remaining, words = re.search(PACKED_CODES_RE, code).groups()
    radix, remaining = int(radix), int(remaining)
    words = words.split('|')

    replacements = {}
    for index in reversed(range(remaining)):
        placeholder = encode_base_n(index, radix)
        replacements[placeholder] = words[index] or placeholder

    return re.sub(
        r'\b(\w+)\b', lambda match: replacements[match.group(0)],
        packed)
4399
4400
def caesar(s, alphabet, shift):
    """
    Apply a Caesar cipher to `s`

    Every character found in `alphabet` is replaced by the one `shift`
    positions further (wrapping around); other characters are kept.
    `s` is returned untouched when `shift` is 0.
    """
    if shift == 0:
        return s

    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4408
4409
def rot47(s):
    """Shift every character of `s` that is in the alphabet below by 47 positions (see caesar)"""
    rot47_alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, rot47_alphabet, 47)
4412
4413
def parse_m3u8_attributes(attrib):
    """
    Parse a comma-separated `KEY=value` attribute list into a dict

    Values may be quoted with double quotes (and may then contain commas);
    the surrounding quotes are removed. All values are returned as strings.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        name: raw[1:-1] if raw.startswith('"') else raw
        for name, raw in pairs
    }
4421
4422
def urshift(val, n):
    """
    Right shift `val` by `n` bits, treating a negative `val` as unsigned

    Negative values are offset by 2**32 before shifting.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
4425
4426
def write_xattr(path, key, value):
    """
    Set the extended attribute `key` of the file at `path` to `value`

    The following methods are tried in order:
    * on Windows, an NTFS Alternate Data Stream named `path:key`
    * the python module available as `xattr` (xattr or pyxattr)
    * the `setfattr` or `xattr` executables

    @param path     path of the file
    @param key      name of the attribute
    @param value    value of the attribute; it is written to a binary file on
                    Windows and `.decode()`d for the executables, so bytes are expected
    @raises XAttrMetadataError      if setting the attribute fails
    @raises XAttrUnavailableError   if no module or executable could be used
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        # ':' separates the file name from the stream name
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        # With an older pyxattr, setxattr stays None and Method 2 is used instead
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The value is passed as a command line argument, so it has to be text
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4476
4477
def random_birthday(year_field, month_field, day_field):
    """
    Pick a random date between 1950-01-01 and 1995-12-31 (both included)

    @returns    dict mapping the given field names to the year, month and
                day of the date, each as a string without zero padding
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))

    fields = (year_field, month_field, day_field)
    parts = (birthday.year, birthday.month, birthday.day)
    return {field: str(part) for field, part in zip(fields, parts)}
4488
4489
def find_available_port(interface=''):
    """
    Find a port that can currently be bound on `interface`

    The port is chosen by binding a temporary socket to port 0; the socket
    is closed again before returning.

    @returns    the port number, or None if the socket could not be set up
    """
    with contextlib.suppress(OSError):
        with socket.socket() as sock:
            sock.bind((interface, 0))
            return sock.getsockname()[1]
    return None
4497
4498
# Templates for internet shortcut files, which are plain text files.
# The `%(...)s` placeholders are meant to be filled in with %-formatting from a mapping.

# `[InternetShortcut]` file; needs the `url` key
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# Apple property list (XML) holding the URL; needs the `url` key
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# `[Desktop Entry]` file of type "Link"; needs the `filename` and `url` keys
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps the name of each link type to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4530
4531
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    IRIs without a network location (e.g. relative references such as `/path/ü`) are supported; only their path, parameters, query and fragment are escaped.

    @param iri      the IRI, as a string
    @returns        the equivalent ASCII-only URI, as a string
    @raises ValueError  if the network location contains an IPv6 literal
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no network location (e.g. a relative reference)
    hostname = iri_parts.hostname or ''
    net_location += hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    # NOTE(review): port 80 is dropped whatever the scheme is - confirm this is intended for schemes other than http
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    path = urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~")

    # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
    params = urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~")

    # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
    query = urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~")

    fragment = urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")

    return urllib.parse.urlunparse(
        (iri_parts.scheme, net_location, path, params, query, fragment))
4574
4575
def to_high_limit_path(path):
    """Return `path` in a form that avoids the MAX_PATH limitation on Windows; other platforms get `path` back unchanged"""
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows by prefixing the absolute path.
    # The maximum allowed length for the individual path segments may still be quite limited.
    absolute = os.path.abspath(path)
    return '\\\\?\\' + absolute
4582
4583
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """
    Format a value taken from `obj` with a %-style template

    @param obj          object to look the value up in (with traverse_obj)
    @param field        path(s) of the value; passed through variadic()
    @param template     %-style template that the value is applied to
    @param ignore       value(s) for which `default` is returned instead.
                        When not given, any falsy value is ignored
    @param default      what to return when the value is ignored
    @param func         function applied to the value before formatting
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val
    else:
        skip = val in variadic(ignore)
    return default if skip else template % func(val)
4589
4590
def clean_podcast_url(url):
    """Remove the known tracking/analytics redirect prefixes listed below from a podcast URL"""
    tracker_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/'''
    without_trackers = re.sub(tracker_prefix_re, '', url)
    # Removing a prefix can leave two schemes in a row ("https://https://..."); keep the second one
    return re.sub(r'^\w+://(\w+://)', r'\1', without_trackers)
4611
4612
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """
    Generate a random (version 4) UUID

    @returns    the UUID as a lowercase string of the form
                `xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx`
    """
    def random_digit(mobj):
        # 'x' is any hex digit. 'y' carries the variant in its two highest
        # bits, which must be `10` for this kind of UUID, so it is one of 8, 9, a, b
        return random.choice(_HEX_TABLE if mobj.group(0) == 'x' else '89ab')

    return re.sub(r'[xy]', random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4618
4619
def make_dir(path, to_screen=None):
    """
    Create the parent directory of `path` (and any missing ancestors)

    @param path         path of the file whose directory is needed
    @param to_screen    optional callable that is given an error message
                        when the directory cannot be created
    @returns            True on success (also when `path` has no directory
                        part), False if the directory could not be created
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Only report when a reporter was actually given; calling the default
        # of None would raise a TypeError instead of returning False
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4630
4631
def get_executable_path():
    """Return the absolute path of the directory that contains the executable reported by the update module"""
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
4636
4637
def get_user_config_dirs(package_name):
    """Yield the per-user directories in which the configuration of `package_name` may be located"""
    # .config (e.g. ~/.config/package_name)
    config_home = os.getenv('XDG_CONFIG_HOME')
    if not config_home:
        config_home = compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name), only when the variable is set
    appdata_dir = os.getenv('appdata')
    if appdata_dir:
        yield os.path.join(appdata_dir, package_name)

    # home (~/.package_name)
    home_dir = compat_expanduser('~')
    yield os.path.join(home_dir, f'.{package_name}')
4650
4651
def get_system_config_dirs(package_name):
    """Yield the system-wide directories in which the configuration of `package_name` may be located"""
    # /etc/package_name
    etc_dir = os.path.join('/etc', package_name)
    yield etc_dir
4655
4656
def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)

    The keyword arguments are passed to datetime.timedelta and the resulting
    offset is added to the current time.
    """
    now = time.time()
    offset = datetime.timedelta(**kwargs)
    return now + offset.total_seconds()
4662
4663
4664 # create a JSON Web Signature (jws) with HS256 algorithm
4665 # the resulting format is in JWS Compact Serialization
4666 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
4667 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """
    Build a token of the form `header.payload.signature` signed with HMAC-SHA256

    NB: All three parts use the standard base64 alphabet with padding
    (base64.b64encode), not the URL-safe variant.

    @param payload_data     JSON-serializable payload
    @param key              signing key, as a string
    @param headers          additional header fields; they take precedence
                            over the default `alg` and `typ`
    @returns                the token, as bytes
    """
    header_data = dict(alg='HS256', typ='JWT')
    header_data.update(headers or {})

    def b64_json(obj):
        return base64.b64encode(json.dumps(obj).encode())

    signing_input = b'.'.join((b64_json(header_data), b64_json(payload_data)))
    digest = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return b'.'.join((signing_input, base64.b64encode(digest)))
4681
4682
4683 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """
    Return the decoded JSON payload of a `header.payload.signature` token

    The header and signature are not inspected, i.e. nothing is verified.

    @raises ValueError  if the token does not consist of exactly three parts
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded_payload = f'{payload_b64}==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
4689
4690
# State of virtual terminal processing for the Windows console: False on Windows
# until it gets enabled (see windows_enable_vt_mode), None on every other OS
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4692
4693
@functools.cache
def supports_terminal_sequences(stream):
    """
    Whether terminal (escape) sequences may be written to `stream`

    This requires WINDOWS_VT_MODE to be truthy on Windows, or the `TERM`
    environment variable to be set elsewhere, and the stream to be a tty.
    The result is cached per stream.
    """
    if compat_os_name == 'nt':
        environment_ok = bool(WINDOWS_VT_MODE)
    else:
        environment_ok = bool(os.getenv('TERM'))
    if not environment_ok:
        return False

    try:
        return stream.isatty()
    except BaseException:
        # Any failure to query the stream counts as "not supported"
        return False
4705
4706
def windows_enable_vt_mode():
    """
    Enable virtual terminal processing for the Windows console output

    Nothing is done when get_windows_version() reports less than 10.0.10586.
    On success, WINDOWS_VT_MODE is set to True and the cache of
    supports_terminal_sequences is cleared.
    Ref: https://bugs.python.org/issue30075

    @raises Exception   if GetConsoleMode or SetConsoleMode reports a failure
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported here since these are only needed (and msvcrt only importable) on Windows
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output explicitly instead of relying on stdout/stderr
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Add the flag to the current mode, keeping all other mode bits
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Results cached while the mode was still disabled are no longer valid
    supports_terminal_sequences.cache_clear()
4736     supports_terminal_sequences.cache_clear()
4737
4738
# Matches ESC, "[", one or more characters other than "m", and the closing "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with everything matching _terminal_sequences_re removed"""
    return re.sub(_terminal_sequences_re, '', string)
4744
4745
def number_of_digits(number):
    """Return the length of `number` when formatted with `%d` (a minus sign is counted too)"""
    rendered = '%d' % number
    return len(rendered)
4748
4749
def join_nonempty(*values, delim='-', from_dict=None):
    """
    Join the truthy values into one string

    @param values       the values; they are converted with str()
    @param delim        separator placed between the values
    @param from_dict    when given, `values` are treated as paths and are
                        looked up in it (with traverse_obj) first
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
4754
4755
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    The thumbnails are returned as they are if no format has a width.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    dimensions = [tuple(fmt.get(key) or 0 for key in _keys) for fmt in formats]
    # Tuples compare by width first and by height on ties
    largest = max(dimensions, default=(0, 0))
    max_width = largest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': scaled_url}, dict(zip(_keys, largest)), thumbnail))
    return scaled
4776
4777
def parse_http_range(range):
    """
    Parse value of "Range" or "Content-Range" HTTP header into tuple.

    @returns    (start, end, total); end and total are passed through
                int_or_none, and all three are None if the value is empty
                or no byte range is found in it
    """
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not crg:
        return None, None, None
    start, end, total = crg.groups()
    return int(start), int_or_none(end), int_or_none(total)
4786
4787
def read_stdin(what):
    """
    Announce that `what` is about to be read from STDIN and return sys.stdin

    The message names the key combination that ends the input
    (Ctrl+Z on Windows, Ctrl+D elsewhere).
    """
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    else:
        eof = 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
4792
4793
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data     the first bytes of the file
    @returns (encoding, bytes to skip)
             The encoding is None if it could not be determined
    """

    # BOM marks are given priority over declarations
    bom_match = next(
        ((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_match is not None:
        return bom_match

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    # Only a `# coding: <name>` declaration on the very first line is recognized
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if not declaration:
        return None, 0
    return declaration.group(1).decode(), 0
4810
4811
class Config:
    """
    The arguments of one configuration source (e.g. the command line, a
    config file or stdin) along with the configs that it loads

    Further configs are discovered through the `config_locations` option of
    the parsed arguments and are stored, as Config objects, in `configs`.
    """
    # The arguments of this config, as given to init()
    own_args = None
    # Set to own_args once they have been parsed by load_configs()
    parsed_args = None
    # Path of the file that the arguments were read from, if any
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        """
        @param parser   the option parser; it must provide parse_known_args,
                        parse_args and error
        @param label    description of this config, used by __str__
        """
        self.parser, self.label = parser, label
        # _loaded_paths is shared with the nested configs (see append_config)
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """
        Set the arguments (and the file that they came from) and load the configs they refer to

        May only be called once. Returns the result of load_configs().
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """
        Parse own_args and load every config listed in their `config_locations`

        @returns    False if the file of this config was already loaded
                    (nothing is parsed in that case), True otherwise
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            # "-" stands for stdin, which is read at most once
            if location == '-':
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against the directory of this config's file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        """Describe this config and, indented below it, its nested configs; login info is hidden"""
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """
        Read the arguments contained in the file `filename`

        The encoding is taken from determine_file_encoding, falling back
        to preferredencoding().

        @returns    list of arguments (split with shlex, honoring comments),
                    or `default` if the file cannot be opened
        @raises ValueError  if the contents cannot be decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """
        Return a copy of the argument list `opts` in which the values of the
        options listed in PRIVATE_OPTS are replaced with "PRIVATE"

        Both the `--option=value` and the `--option value` forms are handled.
        """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            # The value of a private option given on its own is the next argument
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """
        Create a nested config from `args` (the arguments of init()) and
        keep it, unless its init() returns False
        """
        config = type(self)(self.parser, label)
        # Share the record of loaded paths so that no location is loaded twice
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """All arguments: those of the nested configs (last appended first), followed by the own parsed ones"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        """Parse all_args with the `parse_known_args` of the parser"""
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        """Parse all_args with the `parse_args` of the parser"""
        return self.parser.parse_args(self.all_args)
4919
4920
class WebSocketsWrapper:
    """
    Wraps websockets module to use in non-async scopes

    Every instance owns a private event loop on which all the coroutines of
    the connection are run to completion. It can be used as a context
    manager; closing is also registered to be run at interpreter exit.
    """
    # The entered connection; None until __enter__ has been run
    pool = None

    def __init__(self, url, headers=None, connect=True):
        """
        @param url      URL to connect to
        @param headers  extra headers for the connection
        @param connect  whether to connect right away
        """
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        """Send through the connection; blocks until done"""
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        """Receive from the connection; blocks until done"""
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        # This is also registered with atexit, so it may be called again
        # after the connection was closed explicitly. Nothing is left to do then
        if self.loop.is_closed():
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # The remaining tasks can only be cancelled while the loop is still
            # open, so the loop must be closed last
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """
        Run the coroutine `main` on `loop` until it completes and return its result

        @raises ValueError  if `main` is not a coroutine
        """
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel all the tasks of `loop` (which must still be open) and wait for them to finish"""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # No "loop" argument here: it was removed in python 3.10, and
        # gather() takes the loop from the tasks themselves
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
4990
4991
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        # Names are normalized with str.title(), e.g. "content-type" -> "Content-Type"
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
4995
4996
def cached_method(f):
    """
    Cache a method

    The results are stored per instance (in its `_cached_method__cache`
    attribute) and per method name, keyed by the values of all arguments
    after defaults have been applied. The arguments must be hashable.
    """
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        call = signature.bind(self, *args, **kwargs)
        call.apply_defaults()
        # The first bound argument is `self`, which is not part of the key
        _, *call_values = call.arguments.values()
        key = tuple(call_values)

        per_instance = vars(self).setdefault('_cached_method__cache', {})
        results = per_instance.setdefault(f.__name__, {})
        if key in results:
            return results[key]
        result = results[key] = f(self, *args, **kwargs)
        return result
    return wrapper
5012
5013
class classproperty:
    """
    property access for class methods with optional caching

    Usable both as `@classproperty` and as `@classproperty(cache=True)`.
    With caching, the function is evaluated only once per class.
    """
    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Called with options only: the function is supplied by a second call
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        # Maps each class to its cached value; None when caching is disabled
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls not in cache:
            cache[cls] = self.func(cls)
        return cache[cls]
5032
5033
class function_with_repr:
    """
    Wrap a function to control its repr()

    Calls are forwarded to the function. The repr is `repr_` if that is
    given (and non-empty), or else the module and qualified name of the
    function.
    """

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5046
5047
class Namespace(types.SimpleNamespace):
    """SimpleNamespace that can be iterated over: iterating yields the attribute values"""

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        """The (name, value) pairs of the attributes, as a dict view"""
        return vars(self).items()
5057
5058
# File extensions (without the leading dot), grouped by the kind of media
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# `video` and `audio` also include their `common_*` counterparts (appended at the end)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions (thumbnails, storyboards and subtitles are not included)
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5073
5074
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue

    The loop body is run again as long as it sets `retry.error`, for at most
    `retries` + 1 attempts. After every failed attempt, the error callback
    is called with the error, the attempt number and the number of retries
    (plus the keyword arguments given to the constructor).
    """
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # _error is NO_DEFAULT when the last attempt did not set any error
        if self._error is NO_DEFAULT:
            return False
        return self.attempt <= self.retries

    @property
    def error(self):
        return None if self._error is NO_DEFAULT else self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Reset the error so that a successful attempt ends the loop
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            failure = self.error
            if failure:
                self.error_callback(failure, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """
        Utility function for reporting retries

        @param e            the error of the failed attempt
        @param count        number of the failed attempt
        @param retries      maximum number of retries
        @param sleep_func   delay before the next attempt in seconds, or a
                            function that is called with n=<number of retries so far>
        @param info         function for reporting the delay
        @param warn         function for reporting that a retry follows
        @param error        function for reporting that no retry is left.
                            If not given, `e` is raised instead
        @param suffix       text added after "Retrying" in the warning
        """
        if count > retries:
            if not error:
                raise e
            message = f'{e}. Giving up after {count - 1} retries' if count > 1 else str(e)
            return error(message)

        if not count:
            return warn(e)
        if isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        suffix_text = format_field(suffix, None, " %s")
        warn(f'{e}. Retrying{suffix_text} ({count}/{retries})...')

        if callable(sleep_func):
            delay = float_or_none(sleep_func(n=count - 1))
        else:
            delay = sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5129
5130
def make_archive_id(ie, video_id):
    """
    Build the archive ID `<lowercase extractor key> <video id>`

    @param ie   the extractor key as a string, or an object whose
                `ie_key()` method returns it
    """
    if not isinstance(ie, str):
        ie = ie.ie_key()
    return f'{ie.lower()} {video_id}'
5134
5135
def truncate_string(s, left, right=0):
    """
    Shorten `s` to `left` + `right` characters by replacing its middle with "..."

    @param left     length of the leading part, including the 3 characters of "..."
    @param right    number of characters to keep from the end
    @returns        `s` itself if it is None or already short enough
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5141
5142
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """
    Build an ordered set of items from a list of options

    Every option either adds items or, when prefixed with "-", removes them.
    An option may be the name of an alias, which stands for a list of
    further options (negating an alias negates each of them).

    @param options      the options to apply, in order
    @param alias_dict   maps alias names to lists of options. The "all"
                        alias is required and lists every valid item
    @param use_regex    treat the options as case-insensitive regexes that
                        must fully match an item (not applied inside aliases)
    @param start        the initially requested items
    @raises ValueError  for an option that is neither an alias nor a valid item
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for option in options:
        discard = option.startswith('-')
        name = option[1:] if discard else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if discard:
                # Negating an alias flips the sign of each of its members
                expansion = [
                    member[1:] if member.startswith('-') else f'-{member}'
                    for member in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            current = [name]
        else:
            raise ValueError(name)

        if not discard:
            requested.extend(current)
            continue
        for item in current:
            # Remove every occurrence, not only the first one
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5171
5172
5173 # TODO: Rewrite
class FormatSorter:
    """Compute sort keys for format dicts according to a list of sort fields.

    Each sort item has the form "[+]FIELD[:LIMIT]" or "[+]FIELD[~LIMIT]"
    (see `regex`): "+" sets the field's "reverse" flag, ":" gives a limit
    and "~" asks for the value closest to the limit.

    `settings` describes every known field. Recognised keys of an entry:
        type          "field" (default), "ordered", "boolean", "extractor",
                      "multiple", "combined" or "alias"
        field         Key(s) of the format dict to read, or the target of an alias
        convert       How a limit given as text is converted: "string", "bytes",
                      "float_none", "order", "ignore" or (fallback) "float_string"
        order         Preference list for "ordered" fields (earlier is better)
        order_free    Alternative to "order" used with `prefer_free_formats`
        regex         Whether entries of "order" are regular expressions
        default       Fallback passed to float_or_none when computing the key
        max           Cut-off for "extractor" fields
        in_list/not_in_list  Membership tests for "boolean" fields
        function      Reducer applied to the values of a "multiple" field
        visible       Whether the field is shown by print_verbose_info
        forced/priority      Whether the field is always placed before the
                             user/extractor supplied order
        deprecated    Whether using the alias emits a deprecation warning

    The per-item state ("reverse", "closest", "limit", "limit_text") and any
    lazily computed defaults are stored in the same table, so every instance
    works on its own copy of it.
    """
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        """
        @param ydl               Object providing `params` (a dict), `deprecated_feature()`
                                 and `write_debug()`
        @param field_preference  Iterable of sort items supplied by the extractor
        """
        self.ydl = ydl
        self._order = []
        # The methods below write into the settings table (cached defaults, limits,
        # "reverse"/"closest" flags, registration of arbitrary fields). Take a private
        # copy so that this state does not leak into the class and thereby into every
        # other sorter. Only the per-field dicts are written to, so copying one level
        # deep is enough; the "order" lists and lambdas stay shared.
        self.settings = {field: dict(props) for field, props in self.settings.items()}
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return setting `key` of `field`, computing and caching a default if unset.

        Unknown fields are registered with empty settings (with a deprecation
        warning), except when only "forced"/"priority" is queried, in which
        case False is returned without registering anything.
        """
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        props = self.settings[field]
        if key not in props:
            field_type = props.get('type')
            if key == 'field':
                default = 'preference' if field_type == 'extractor' else (field,) if field_type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if field_type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            # Cache the default so later lookups are plain dict accesses
            props[key] = default
        return props[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert the textual `value` of `field` according to its "convert" setting.

        @param convertNone  When false, a `value` of None is returned as None
                            without conversion
        @returns            The converted value. For "order" conversion this is a
                            rank where a higher number means an earlier position
                            in the field's order list
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # Values not found in the list rank like the '' entry, or below
            # every entry when the list has no '' entry
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or  value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # A non-numeric limit switches the field to string conversion
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Build the effective sort order (`self._order`) and store each field's limit.

        The items are processed in this order: forced default fields, priority
        default fields (skipped if `format_sort_force` is set), the user's
        `format_sort`, the extractor's items and finally all defaults.
        A field that was already added is ignored when it appears again.

        @raises ExtractorError  If an item does not match `regex`
        """
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
                else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A "combined" field expands to its member fields; with multiple limits,
            # the ":"-separated limits are assigned to the members in order
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Report the user's, the extractor's and the resulting sort order via `write_debug`."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn the raw `value` of `field` into a tuple usable as (part of) a sort key.

        The first element of the returned tuple separates the classes of values:
        -10 for a missing value, 1 for non-numeric values, 0 for numeric values
        and -1 for numeric values on the wrong side of the limit.
        """
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Read the value(s) of `field` from `format` and return the field's sort key."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            # Reduce the values of all member fields to a single one
            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return the sort key of `format`: one tuple per field in the sort order.

        NB: `format` is modified in-place. Missing "protocol", "ext", "vbr",
        "abr" and "tbr" are filled in where they can be derived, "video_ext"
        and "audio_ext" are always (re)assigned, and "preference" is set to
        -100 for HEVC-in-FLV formats that have no preference.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5468
5469
5470 # XXX: Temporary
5471 class _YDLLogger:
5472     def __init__(self, ydl=None):
5473         self._ydl = ydl
5474
5475     def debug(self, message):
5476         if self._ydl:
5477             self._ydl.write_debug(message)
5478
5479     def info(self, message):
5480         if self._ydl:
5481             self._ydl.to_screen(message)
5482
5483     def warning(self, message, *, once=False):
5484         if self._ydl:
5485             self._ydl.report_warning(message, once)
5486
5487     def error(self, message, *, is_error=True):
5488         if self._ydl:
5489             self._ydl.report_error(message, is_error=is_error)
5490
5491     def stdout(self, message):
5492         if self._ydl:
5493             self._ydl.to_stdout(message)
5494
5495     def stderr(self, message):
5496         if self._ydl:
5497             self._ydl.to_stderr(message)