# yt_dlp/utils/_utils.py
# [networking] Rewrite architecture (#2861)
1 import asyncio
2 import atexit
3 import base64
4 import binascii
5 import calendar
6 import codecs
7 import collections
8 import collections.abc
9 import contextlib
10 import datetime
11 import email.header
12 import email.utils
13 import errno
14 import hashlib
15 import hmac
16 import html.entities
17 import html.parser
18 import inspect
19 import io
20 import itertools
21 import json
22 import locale
23 import math
24 import mimetypes
25 import netrc
26 import operator
27 import os
28 import platform
29 import random
30 import re
31 import shlex
32 import socket
33 import ssl
34 import struct
35 import subprocess
36 import sys
37 import tempfile
38 import time
39 import traceback
40 import types
41 import unicodedata
42 import urllib.error
43 import urllib.parse
44 import urllib.request
45 import xml.etree.ElementTree
46
47 from . import traversal
48
49 from ..compat import functools # isort: split
50 from ..compat import (
51 compat_etree_fromstring,
52 compat_expanduser,
53 compat_HTMLParseError,
54 compat_os_name,
55 compat_shlex_quote,
56 )
57 from ..dependencies import websockets, xattr
58
# Pretend to be the parent module, so that names defined here appear to
# live directly under `yt_dlp.utils` (e.g. in tracebacks and reprs)
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise: the canonical type of a compiled
# regular expression, usable in isinstance() checks
compiled_regex_type = type(re.compile(''))


# Fixed User-Agent strings for extractors that need to impersonate a
# specific browser
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
68
69
class NO_DEFAULT:
    """Sentinel type used to distinguish 'no argument supplied' from None."""
    pass


def IDENTITY(x):
    """Identity function: returns its argument unchanged."""
    return x
76
77
# English month names, indexed by (month number - 1)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month-name lists keyed by language code, used when parsing free-form dates
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Maps timezone abbreviation -> UTC offset in hours
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
# maps each accented character to its ASCII approximation
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
108
# strptime formats tried, in order, when parsing free-form date strings
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Additional formats for locales that write the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# Additional formats for locales that write the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of a P.A.C.K.E.R.-packed javascript payload
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches <script type="application/ld+json"> blocks (JSON-LD metadata)
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# An integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
179
180
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    encoding = 'UTF-8'
    # Fall back to UTF-8 when the locale reports nothing usable
    with contextlib.suppress(Exception):
        candidate = locale.getpreferredencoding()
        'TEST'.encode(candidate)  # verify the codec actually works
        encoding = candidate
    return encoding
195
196
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # Write to a temporary file in the destination directory first, then
    # rename it over the target so readers never observe a partial file
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile creates the file with restrictive 0600
            # permissions; widen them to honor the process umask, as a
            # plain open() would have
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file before re-raising
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
221
222
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
228
# Historical note: xpath_with_ns predates reliable namespace support in
# xml.etree.ElementTree; it expands 'ns:tag' path components into the
# fully-qualified '{uri}tag' form that ElementTree understands.
231
232
def xpath_with_ns(path, ns_map):
    """Expand 'ns:tag' components of an XPath into '{uri}tag' form using ns_map."""
    def expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            return parts[0]
        ns, tag = parts
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join(expand(component) for component in path.split('/'))
243
244
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching `xpath` (a string, or an iterable of
    candidate xpaths tried in order) under `node`.

    @param name     human-readable name used in the error message
    @param fatal    raise ExtractorError when nothing matches
    @param default  value returned when nothing matches (takes precedence over fatal)
    """
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        # Initialize so an empty iterable of xpaths does not raise
        # UnboundLocalError below
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
266
267
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but returns the matched element's text content."""
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element
    if element.text is not None:
        return element.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
281
282
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the first element matching xpath[@key]."""
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (f'{xpath}[@{key}]' if name is None else name))
    return None
294
295
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute in the passed HTML document"""
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the html of the first tag with the specified attribute in the passed HTML document"""
    # NOTE: 'kargs' (sic) is kept as-is; renaming would break callers that
    # pass it by keyword
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # NOTE(review): **kargs is accepted but not forwarded — confirm intended
    # The lookbehind/lookahead match class_name as a whole word inside a
    # (possibly multi-valued) class attribute
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
350
351
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    # If the value begins with a character that cannot appear unquoted in an
    # HTML attribute value, the surrounding quote is mandatory; otherwise the
    # quote is optional
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches the opening tag up to and including attribute=value; (?-x:...)
    # disables verbose mode so whitespace inside `value` stays significant
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # strip enclosing quotes that may have been captured with the content
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
377
378
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Control-flow exception: signals that the outermost tag was closed
        pass

    def __init__(self):
        # Stack of currently-open tag names (outermost first)
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until we find the matching opening tag; unmatched inner tags
        # (e.g. unclosed <p>) are silently discarded
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The outermost tag has been closed; abort parsing
            raise self.HTMLBreakOnClosingTagException()
419
420
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index that raises a parse error instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed just the opening tag and verify the parser agrees on its name
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            # Advance through candidate closing tags until the parser raises
            # HTMLBreakOnClosingTagException, i.e. the outermost tag really
            # closed; this correctly skips closing tags of nested same-name
            # elements
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
455
456
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        # Only the first start tag is of interest; abort parsing by raising
        # (the caller, e.g. extract_attributes, suppresses this exception)
        raise compat_HTMLParseError('done')
467
468
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        self.items = []
        self._level = 0  # current element nesting depth

    def handle_starttag(self, tag, attrs):
        # collect attributes only for top-level <li> elements
        if self._level == 0 and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
484
485
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    # HTMLAttributeParser deliberately raises compat_HTMLParseError after the
    # first start tag; suppress it and return whatever was collected
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
505
506
def parse_list(webpage):
    """Given a string for a series of top-level HTML <li> elements,
    return a list with each element's attributes as a dict"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
514
515
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Collapse all whitespace, then turn <br> and paragraph boundaries into
    # newlines (order matters: the tag-stripping below would destroy them)
    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
530
531
class LenientJSONDecoder(json.JSONDecoder):
    """JSON decoder that tolerates common breakages in scraped JSON.

    @param transform_source  callable applied to the input string before decoding
    @param ignore_extra      ignore any trailing data after the first JSON value
    @param close_objects     max number of unclosed objects/arrays to auto-close
    """
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # each auto-close may take up to two passes (insert ',', then close)
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Return the truncated document with one object/array closed,
        or None if the error is not recoverable this way."""
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    # Keep `s` intact if the document cannot be repaired, so
                    # the error message below still shows the failing context
                    closed = self._close_object(e)
                    if closed is not None:
                        s = closed
                        continue
                # Clamp the lower slice bound so an error near the start of
                # the document does not wrap around to the end of the string
                raise type(e)(f'{e.msg} in {s[max(e.pos - 10, 0):e.pos + 10]!r}', s, e.pos)
        # Unreachable: every iteration either returns or raises
        raise AssertionError('Too many attempts to decode JSON')
570
571
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed or is unsupported; fall back to a plain open
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # EACCES will not be fixed by renaming; give up immediately
            if attempt or err.errno in (errno.EACCES,):
                raise
            # First failure: retry once with a sanitized path
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
609
610
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
618
619
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            # '\0' marks substitute characters, collapsed and stripped below
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
673
674
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    # NOTE: on win32 `force` is ignored (sanitization always happens);
    # on other platforms the path is returned unchanged unless force is set
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        # Replace characters invalid in Windows path components, plus any
        # trailing space/dot, keeping '.' and '..' components intact
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Preserve the leading separator of absolute POSIX paths
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
696
697
def sanitize_url(url, *, scheme='http'):
    """Prepend `scheme` to protocol-relative URLs and fix common scheme typos."""
    if url is None:
        return None
    if url.startswith('//'):
        # Prepend protocol-less URLs with `http:` scheme in order to mitigate
        # the number of unwanted failures due to missing protocol
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    for mistake, fixup in (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    ):
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
716
717
def extract_basic_auth(url):
    """Strip userinfo from `url`; return (clean_url, 'Basic ...' header or None)."""
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    netloc = parts.hostname if parts.port is None else f'{parts.hostname}:{parts.port}'
    clean_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = f'{parts.username}:{parts.password or ""}'
    token = base64.b64encode(credentials.encode()).decode()
    return clean_url, f'Basic {token}'
728
729
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request from a sanitized/escaped URL, moving any
    userinfo credentials into an 'Authorization: Basic' header."""
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # args[1] is the `headers` positional argument of urllib.request.Request
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
736
737
def expand_path(s):
    """Expand shell variables and ~"""
    # compat_expanduser is used instead of os.path.expanduser for
    # platform-compatibility tweaks — see ..compat
    return os.path.expandvars(compat_expanduser(s))
741
742
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def unique():
        seen = []  # a list, not a set, since the items may be unhashable
        for item in iterable:
            if item not in seen:
                seen.append(item)
                yield item

    return unique() if lazy else list(unique())
753
754
755 def _htmlentity_transform(entity_with_semicolon):
756 """Transforms an HTML entity to a character."""
757 entity = entity_with_semicolon[:-1]
758
759 # Known non-numeric HTML entity
760 if entity in html.entities.name2codepoint:
761 return chr(html.entities.name2codepoint[entity])
762
763 # TODO: HTML5 allows entities without a semicolon.
764 # E.g. '&Eacuteric' should be decoded as 'Éric'.
765 if entity_with_semicolon in html.entities.html5:
766 return html.entities.html5[entity_with_semicolon]
767
768 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
769 if mobj is not None:
770 numstr = mobj.group(1)
771 if numstr.startswith('x'):
772 base = 16
773 numstr = '0%s' % numstr
774 else:
775 base = 10
776 # See https://github.com/ytdl-org/youtube-dl/issues/7518
777 with contextlib.suppress(ValueError):
778 return chr(int(numstr, base))
779
780 # Unknown entity in name, return its literal representation
781 return '&%s;' % entity
782
783
def unescapeHTML(s):
    """Replace HTML entities (e.g. '&amp;') in `s` with their characters."""
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
791
792
def escapeHTML(text):
    """Escape &, <, >, and both quote characters for safe HTML embedding."""
    replacements = (
        ('&', '&amp;'),  # must be first, or it would double-escape
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    for char, escaped in replacements:
        text = text.replace(char, escaped)
    return text
802
803
class netrc_from_content(netrc.netrc):
    """A netrc.netrc that parses its data from a string instead of a file."""

    def __init__(self, content):
        # Deliberately skip netrc.netrc.__init__ (which reads a file);
        # initialize its state and feed the content via the internal parser
        self.hosts = {}
        self.macros = {}
        with io.StringIO(content) as stream:
            self._parse('-', stream, False)
809
810
class Popen(subprocess.Popen):
    # Hide the console window spawned for child processes on Windows
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
             https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            # Not running from a PyInstaller bundle; nothing to restore
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether the streams are in text mode so run() can pick the
        # matching empty default ('' vs b'') for missing output
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        """communicate(), but kill (and reap) the process on any failure."""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            # Reap the killed process; timeout=None waits indefinitely
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run the command to completion; return (stdout, stderr, returncode)."""
        with cls(*args, **kwargs) as proc:
            # `proc.__text_mode` is name-mangled to `_Popen__text_mode`, so
            # this works on instances even inside a classmethod
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
867
868
def encodeArgument(s):
    """Return `s` as str, decoding legacy byte-string arguments as ASCII."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
874
875
# Lightweight record for a duration broken down into clock components
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into (hours, minutes, seconds, milliseconds)."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
884
885
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H<delim>]MM<delim>SS[.mmm] text."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        formatted = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        formatted = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        formatted = '%d' % t.seconds
    return '%s.%03d' % (formatted, t.milliseconds) if msec else formatted
895
896
def make_HTTPS_handler(params, **kwargs):
    """Create a (deprecated) urllib HTTPS handler configured from the
    yt-dlp params dict (certificate options, legacy TLS support, certifi)."""
    from ._deprecated import YoutubeDLHTTPSHandler
    from ..networking._helper import make_ssl_context
    return YoutubeDLHTTPSHandler(params, context=make_ssl_context(
        verify=not params.get('nocheckcertificate'),
        client_certificate=params.get('client_certificate'),
        client_certificate_key=params.get('client_certificate_key'),
        client_certificate_password=params.get('client_certificate_password'),
        legacy_support=params.get('legacyserverconnect'),
        use_certifi='no-certifi' not in params.get('compat_opts', []),
    ), **kwargs)
908
909
def bug_reports_message(before=';'):
    """Return a 'please report this issue' blurb, suitable for appending
    after `before` (capitalized when `before` ends a sentence or is empty)."""
    from ..update import REPOSITORY

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg
921
922
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None  # subclasses may set a default message here

    def __init__(self, msg=None):
        # Precedence: explicit argument > subclass default > class name
        if msg is None and self.msg is None:
            self.msg = type(self).__name__
        elif msg is not None:
            self.msg = msg
        super().__init__(self.msg)
933
934
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions
        # Errors raised while handling a network exception are always expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie  # name of the extractor that raised
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            # Chain through to the innermost original exc_info
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: "[ie] video_id: msg (caused by ...)", plus a
        # bug-report blurb for unexpected errors; name-mangled so subclasses
        # cannot accidentally shadow it
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted traceback(s) of `tb` and `cause`, or None."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Keep msg/args in sync whenever any attribute that feeds __msg
        # (ie, video_id, cause, ...) is changed after construction
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
977
978
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL."""

    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
984
985
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Geo-restriction is an expected condition, not a bug
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        # Country codes from which the video is available, if known
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)
1010
1011
class DownloadError(YoutubeDLError):
    """Download Error exception.

    Raised by FileDownloader objects that are not configured to continue on
    errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
1024
1025
class EntryNotInPlaylist(YoutubeDLError):
    """Raised by YoutubeDL when a requested entry is missing from the playlist info_dict"""
    msg = 'Entry not found in info'
1033
1034
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # BUGFIX: the filename was previously dropped (literal '(unknown)');
            # include it in the message like UnavailableVideoError does with `err`
            self.msg += f': {filename}'
        super().__init__(self.msg)
1047
1048
class PostProcessingError(YoutubeDLError):
    """Raised by a PostProcessor's .run() method to signal a failure in the
    postprocessing task.
    """
1055
1056
class DownloadCancelled(YoutubeDLError):
    """Base for exceptions that interrupt the download queue"""
    msg = 'The download was cancelled'
1060
1061
class ExistingVideoReached(DownloadCancelled):
    """Raised when --break-on-existing is triggered"""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1065
1066
class RejectedVideoReached(DownloadCancelled):
    """Raised when --break-match-filter is triggered"""
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1070
1071
class MaxDownloadsReached(DownloadCancelled):
    """Raised when the --max-downloads limit has been reached"""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1075
1076
class ReExtractInfo(YoutubeDLError):
    """Signals that the video info needs to be extracted again"""

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected
1083
1084
class ThrottledDownload(ReExtractInfo):
    """Raised when the download speed falls below --throttled-rate"""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)
1091
1092
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Raised when a video is requested in a format that is not available
    for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            # instance-level msg shadows the class attribute
            self.msg += f': {err}'
        super().__init__(self.msg)
1105
1106
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller than
    what the server announced, indicating the connection was probably
    interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        self.downloaded = downloaded  # bytes actually received
        self.expected = expected  # bytes announced by the server
1120
1121
class XAttrMetadataError(YoutubeDLError):
    """Raised on xattr metadata failures; `reason` classifies the cause."""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code, self.msg = code, msg

        # Classify the failure from errno and/or message text so callers
        # don't have to string-match themselves
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1136
1137
class XAttrUnavailableError(YoutubeDLError):
    """Raised when extended attributes cannot be used on this setup"""
1140
1141
1142 def is_path_like(f):
1143 return isinstance(f, (str, bytes, os.PathLike))
1144
1145
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """HTTPCookieProcessor that applies the same cookie handling to HTTPS
    requests/responses as the stock handler applies to HTTP ones."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    # Reuse the plain-HTTP handlers for HTTPS traffic
    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1155
1156
def extract_timezone(date_str):
    """Extract a timezone from a date string.

    Returns (offset as datetime.timedelta, date_str with the timezone part
    removed). The offset defaults to 0 when nothing recognizable is found.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # No 'Z'/numeric offset: try a trailing alphabetic abbreviation after
        # a time (e.g. ' UTC'), resolved through TIMEZONE_NAMES
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            # Known abbreviation: strip it from the string
            date_str = date_str[:-len(m.group('tz'))]
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # Matched the bare 'Z' alternative => UTC
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1185
1186
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # Fractional seconds are not part of the accepted format
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
    except ValueError:
        return None
    return calendar.timegm(dt.timetuple())
1202
1203
def date_formats(day_first=True):
    """Return the strptime patterns to try, in day-first or month-first order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1206
1207
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Commas, AM/PM markers and timezones only get in the way
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    upload_date = None
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return str(upload_date)
1230
1231
def unified_timestamp(date_str, day_first=True):
    """Convert a free-form date string into a UNIX timestamp, or None.

    @param day_first  Whether ambiguous numeric dates are tried day-first
                      (see date_formats)
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas/pipes and (abbreviated) weekday names; collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # Remember a PM marker before it is stripped below
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Last resort: RFC 2822 parsing
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1263
1264
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to `default_ext`."""
    if url is None or '.' not in url:
        return default_ext
    # Everything after the last '.' of the pre-query part
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1276
1277
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build the subtitle filename: <name>.<lang>.<format>"""
    return replace_extension(filename, f'{sub_lang}.{sub_format}', expected_real_ext)
1280
1281
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format strftime format of DATE
    @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
                     auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # Recursively resolve the base date, then apply the signed offset
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # Months/years need calendar-aware arithmetic; result is
            # then treated as day-precise for rounding purposes
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # 'auto': round to the unit that appeared in date_str
            return datetime_round(new_date, unit)
        return new_date

    # Plain DATE in the given strftime format
    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1322
1323
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict Restrict allowed patterns to "YYYYMMDD" and
                  (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    strict_pattern = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
    if strict and not re.fullmatch(strict_pattern, date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1334
1335
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months, clamping the day
    to the last valid day of the resulting month."""
    years_delta, month_index = divmod(dt.month + months - 1, 12)
    year = dt.year + years_delta
    month = month_index + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
1343
1344
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt  # nothing to round away

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    timestamp = calendar.timegm(dt.timetuple())
    rounded = ((timestamp + seconds_per_unit / 2) // seconds_per_unit) * seconds_per_unit
    return datetime.datetime.utcfromtimestamp(rounded)
1361
1362
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
1371
1372
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = (datetime.datetime.min.date() if start is None
                      else date_from_str(start, strict=True))
        self.end = (datetime.datetime.max.date() if end is None
                    else date_from_str(end, strict=True))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1406
1407
@functools.cache
def system_identifier():
    """One-line description of the Python/OS environment, for debug output."""
    impl = platform.python_implementation()
    if impl == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        impl += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        impl,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1426
1427
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    return version_tuple(platform.win32_ver()[1]) if compat_os_name == 'nt' else ()
1435
1436
def write_string(s, out=None, encoding=None):
    """Write the string `s` to `out` (default sys.stderr), encoding as needed."""
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    codec, target = None, out
    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: we must encode ourselves
        codec = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a buffer: write encoded bytes to the buffer
        target = out.buffer
        codec = encoding or getattr(out, 'encoding', None) or preferredencoding()

    target.write(s.encode(codec, 'ignore') if codec else s)
    out.flush()
1456
1457
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Emit a deprecation message: through the CLI printer (once per unique
    message) when running as yt-dlp, otherwise as a DeprecationWarning."""
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return
    if msg in deprecation_warning._cache:
        return  # each unique message is printed only once
    deprecation_warning._cache.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


deprecation_warning._cache = set()
1473
1474
def bytes_to_intlist(bs):
    """Convert a bytes-like (or str) sequence into a list of ints."""
    if not bs:
        return []
    # bytes/bytearray yield ints directly; str needs ord()
    return list(bs) if isinstance(bs[0], int) else [ord(ch) for ch in bs]
1482
1483
def intlist_to_bytes(xs):
    """Pack a sequence of ints (0-255) into a bytes object."""
    if not xs:
        return b''
    return struct.pack(f'{len(xs)}B', *xs)
1488
1489
class LockingUnsupportedError(OSError):
    """Raised when no file-locking primitive is available on this platform"""
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1495
1496
# Cross-platform file locking: defines _lock_file(f, exclusive, block) and
# _unlock_file(f) using Win32 LockFileEx/UnlockFileEx on Windows, fcntl
# flock/lockf elsewhere, or stubs that raise LockingUnsupportedError.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure required by LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: byte range [0, 0x7fffffffffffffff]
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        # Acquire a Win32 lock on the whole file; raises BlockingIOError on failure
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for _unlock_file
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # Flags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        # Release the lock acquired by _lock_file
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            # flock with fallback to lockf where flock is unavailable
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            # Try every unlock variant; the last call propagates any error
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            # No locking primitive available on this platform
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
1583
1584
class locked_file:
    """File wrapper that holds an advisory lock (via _lock_file) while open.

    Usable as a context manager; all unknown attribute access is delegated to
    the underlying file object. Only plain 'r', 'rb', 'a', 'ab', 'w', 'wb'
    modes are supported.
    """
    locked = False  # whether the lock is currently held

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        # Read-only modes take a shared lock; everything else an exclusive one
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only once the lock is held (cf. the O_CREAT note above)
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the object can also be used without `with`
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate read/write/etc. to the wrapped file object
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1648
1649
@functools.cache
def get_filesystem_encoding():
    """Filesystem encoding reported by Python, defaulting to utf-8."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
1654
1655
def shell_quote(args):
    """Quote each argument for safe shell usage and join them with spaces."""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(a.decode(encoding) if isinstance(a, bytes) else a)
        for a in args)
1665
1666
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, existing_data = unsmuggle_url(url, {})
    data.update(existing_data)
    sdata = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{sdata}'
1675
1676
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: return (clean_url, data) or (url, default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1684
1685
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal sufixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
    suffix = ('', *POSSIBLE_SUFFIXES)[exponent]
    if factor == 1024:
        # Binary prefixes: k -> Ki, M -> Mi, ... (no suffix stays empty)
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    return fmt % (num / factor ** exponent, suffix)
1698
1699
def format_bytes(bytes):
    """Human-readable binary size, e.g. '1.00KiB'; 'N/A' on bad input."""
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
1702
1703
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number> <unit>' using the multipliers in unit_table; None if no match."""
    num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(map(re.escape, unit_table))
    matcher = re.fullmatch if strict else re.match
    m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None

    value = float(m.group('num').replace(',', '.'))
    return round(value * unit_table[m.group('unit')])
1715
1716
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    units = {unit: 1024 ** power for power, unit in enumerate(['', *'KMGTPEZY'])}
    return lookup_unit_table(units, s.upper(), strict=True)
1722
1723
def parse_filesize(s):
    """Parse a human-readable file size (e.g. '5 MiB', '1.5GB') into bytes, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1793
1794
def parse_count(s):
    """Parse a count like '1.2M views' or '1,234' into an int, or None."""
    if s is None:
        return None

    # Drop a leading non-numeric word (e.g. 'Views 1,234' -> '1,234')
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    result = lookup_unit_table({
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }, s)
    if result is not None:
        return result

    # Fall back to the leading number, if any
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))
1822
1823
def parse_resolution(s, *, lenient=False):
    """Extract {'width', 'height'} (or just 'height') from a resolution-like string."""
    if s is None:
        return {}

    # 'WxH' form; the strict variant refuses digits glued to other alphanumerics
    pattern = (r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
               else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    mobj = re.search(pattern, s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}  # 4k -> 2160, 8k -> 4320

    return {}
1847
1848
def parse_bitrate(s):
    """Extract a bitrate in kbps from a string like '128 kbps', or None."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
1855
1856
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    for number, month_name in enumerate(month_names, 1):
        if month_name == name:
            return number
    return None
1866
1867
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1876
1877
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave existing entities and character references untouched
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)', '&amp;', xml_str)
1884
1885
def setproctitle(title):
    """Best-effort: set the process name via libc prctl; silently no-op
    when ctypes, libc or prctl is unavailable."""
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return

    encoded = title.encode()
    name_buffer = ctypes.create_string_buffer(len(encoded))
    name_buffer.value = encoded
    try:
        libc.prctl(15, name_buffer, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1911
1912
def remove_start(s, start):
    """Strip `start` from the beginning of `s` if present; None-safe."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1915
1916
def remove_end(s, end):
    """Strip `end` from the end of `s` if present; None-safe."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
1919
1920
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding `s`."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1928
1929
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1936
1937
def url_basename(url):
    """Last path component of a URL (query/fragment excluded)."""
    path = urllib.parse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
1941
1942
def base_url(url):
    """URL up to and including the last '/' before any query/fragment."""
    return re.match(r'https?://[^?#]+/', url).group(0)
1945
1946
def urljoin(base, path):
    """Join base and path like urllib.parse.urljoin, but return None on
    unusable input and pass through already-absolute paths."""
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path  # already absolute (scheme- or protocol-relative)
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
1960
1961
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` (or its `get_attr` attribute) to an int scaled by
    invscale/scale; return `default` when conversion is impossible."""
    value = getattr(v, get_attr, None) if get_attr and v is not None else v
    try:
        return int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
1969
1970
def str_or_none(v, default=None):
    """str(v), or `default` when `v` is None."""
    return str(v) if v is not None else default
1973
1974
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Strip thousands separators and '+' before converting
        return int_or_none(re.sub(r'[,\.\+]', '', int_str))
    return None
1982
1983
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to a float scaled by invscale/scale; `default` on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
1991
1992
def bool_or_none(v, default=None):
    """Return `v` only if it is a real bool; otherwise `default`."""
    if isinstance(v, bool):
        return v
    return default
1995
1996
def strip_or_none(v, default=None):
    """Stripped string, or `default` when `v` is not a str."""
    if isinstance(v, str):
        return v.strip()
    return default
1999
2000
def url_or_none(url):
    """Return the stripped URL if it uses a recognized scheme (or is
    protocol-relative); otherwise None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
2006
2007
def request_to_url(req):
    """Full URL of a urllib Request, or `req` itself when it is already a URL."""
    return req.get_full_url() if isinstance(req, urllib.request.Request) else req
2013
2014
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a unix timestamp (int/float) or a 'YYYYMMDD' string with
    `date_format`; return `default` on any failure."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                               + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            datetime_object = None  # triggers AttributeError -> default below
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2032
2033
def parse_duration(s):
    """Parse a human-readable duration string into seconds (float), or None."""
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None

    # 1) Clock style: [[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2) ISO-8601-ish / verbose: "P1DT2H", "1d 2 hours 3min 4.5s", ...
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3) Bare fractional "X hours" / "Y mins"
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not m:
                return None
            hours, mins = m.groups()

    if ms:
        # Some sites separate milliseconds with ':' instead of '.'
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2088
2089
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* in front of the real extension of *filename*.

    'video.mp4', 'temp' -> 'video.temp.mp4'.  If *expected_real_ext* is given
    and the actual extension differs, *ext* is appended to the whole filename
    instead: 'video.mp4', 'temp', 'webm' -> 'video.mp4.temp'.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        # Bug fix: this branch previously returned the literal '(unknown)'
        # instead of the original filename, discarding the file's name
        else f'{filename}.{ext}')
2096
2097
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace filename's extension with *ext*.

    When *expected_real_ext* is given and does not match the actual extension,
    *ext* is appended to the full filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return f'{name}.{ext}'
2103
2104
def check_executable(exe, args=[]):
    """Check whether *exe* can be spawned; return its name if so, else False.

    *args* should produce a short output quickly (like ['-version']).
    """
    try:
        Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return exe
    except OSError:
        return False
2113
2114
def _get_exe_version_output(exe, args):
    """Run *exe* with *args* and return its combined stdout/stderr output.

    Returns False when the program cannot be started and None when it
    exits with a non-zero status.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else stdout
2127
2128
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's *output* via *version_re*.

    Returns *unrecognized* when nothing matches.
    """
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
2138
2139
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # unrecognized[0] is used when no version can be parsed from the output,
    # unrecognized[-1] when the program ran but exited with an error
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return unrecognized[-1]
    return output and detect_exe_version(output, version_re, unrecognized[0])
2150
2151
def frange(start=0, stop=None, step=1):
    """Float-friendly analogue of range(); yields start, start+step, ..."""
    if stop is None:
        start, stop = 0, start
    # Sign of step picks the stop condition; step == 0 yields nothing
    direction = (step > 0) - (step < 0)
    value = start
    while direction * value < direction * stop:
        yield value
        value += step
2160
2161
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Raised instead of the builtin so callers can distinguish
        # out-of-range access on a LazyList from other IndexErrors
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache is shared with copies/reversals so each item is computed once
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # ~x == -(x + 1): the same position counted from the opposite end;
        # None (an open slice bound) is passed through unchanged
        return None if x is None else ~x

    def __getitem__(self, idx):
        # Normalize idx into (start, stop, step) over the underlying order
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Otherwise, pull only as many items as this request needs
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Probing the closest end evaluates at most one item
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2249
2250
class PagedList:
    """Base class for lazily fetched, page-based entry lists."""

    class IndexError(IndexError):
        # Distinct subclass so callers can catch paging errors specifically
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the (possibly cached) list of results for page *pagenum*."""
        results = self._cache.get(pagenum)
        if results is None:
            results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
2289
2290
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # Walk pages beginning with the one that contains `start`
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested range within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last page that worked so getpage()
                # can short-circuit requests past it
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2330
2331
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # skip_elems: items to drop from the first page;
        # only_more: total items still wanted (None = unbounded)
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
2356
2357
class PlaylistEntries:
    """Resolves playlist entries, honouring the user's item selection
    (playlist_items / playliststart / playlistend parameters)."""

    # Sentinel for a requested entry absent from the extracted list
    MissingEntry = object()
    # True once the underlying entry list is known to be complete
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Scatter the extracted entries into their 1-based requested slots
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches one "START[:END[:STEP]]" item-spec segment, e.g. "1:10:2", "-5:"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or slice for each comma-separated segment of *string*."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (1-based index, entry) pairs selected by the user's options."""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Total number of playlist entries, when cheaply determinable (else None)."""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Function mapping a 0-based index to an entry; raises self.IndexError
        # when the index is out of range
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Yields (1-based index, entry) pairs for an int or slice of
        # 1-based positions; negative positions count from the end
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2492
2493
def uppercase_escape(s):
    """Decode embedded \\UXXXXXXXX escape sequences in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0], s)
2500
2501
def lowercase_escape(s):
    """Decode embedded \\uXXXX escape sequences in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0], s)
2508
2509
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that must survive unescaped (reserved + already-escaped '%')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe_chars)
2513
2514
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    def quote(component):
        # Same safe-set as escape_rfc3986, inlined here
        return urllib.parse.quote(component, b"%/;:@&=+$,!~*'()?#[]")

    parts = urllib.parse.urlparse(url)
    return parts._replace(
        # Non-ASCII hostnames are punycoded rather than percent-escaped
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=quote(parts.path),
        params=quote(parts.params),
        query=quote(parts.query),
        fragment=quote(parts.fragment),
    ).geturl()
2525
2526
def parse_qs(url, **kwargs):
    """Parse the query string of *url* into a dict of value lists."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2529
2530
def read_batch_urls(batch_fd):
    """Read a batch file object and return its list of cleaned-up URLs.

    Blank lines and lines starting with '#', ';' or ']' are skipped;
    the file object is closed afterwards.
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 BOM, whether raw or already decoded
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2548
2549
def urlencode_postdata(*args, **kargs):
    """urlencode the given data and return it as ASCII bytes for a POST body."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2552
2553
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url          str or parsed URL tuple
    @param query_update dict merged into the existing query parameters
    @returns str
    """
    if isinstance(url, str):
        if not kwargs and not query_update:
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2572
2573
def update_url_query(url, query):
    """Add (or replace) the given *query* parameters in *url*."""
    return update_url(url, query_update=query)
2576
2577
2578 def _multipart_encode_impl(data, boundary):
2579 content_type = 'multipart/form-data; boundary=%s' % boundary
2580
2581 out = b''
2582 for k, v in data.items():
2583 out += b'--' + boundary.encode('ascii') + b'\r\n'
2584 if isinstance(k, str):
2585 k = k.encode()
2586 if isinstance(v, str):
2587 v = v.encode()
2588 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2589 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2590 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2591 if boundary.encode('ascii') in content:
2592 raise ValueError('Boundary overlaps with data')
2593 out += content
2594
2595 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2596
2597 return out, content_type
2598
2599
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    fixed_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            # A random boundary collided with the payload: retry with a new one
            if fixed_boundary:
                raise
            boundary = None
2628
2629
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """True if *x* is an *allowed_types* iterable, excluding str/bytes/mappings by default."""
    blocked = (str, bytes, collections.abc.Mapping) if blocked_types is NO_DEFAULT else blocked_types
    return isinstance(x, allowed_types) and not isinstance(x, blocked)
2634
2635
def variadic(x, allowed_types=NO_DEFAULT):
    """Wrap *x* in a 1-tuple unless it is already iterable-like."""
    if not isinstance(allowed_types, (tuple, type)):
        # Passing an iterable of types is deprecated
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    return x if is_iterable_like(x, blocked_types=allowed_types) else (x,)
2641
2642
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn and return the first usable result.

    Common data-access exceptions are swallowed; results not matching
    *expected_type* (when given) are discarded.  Returns None if nothing fits.
    """
    for fn in funcs:
        try:
            result = fn(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
2652
2653
def try_get(src, getter, expected_type=None):
    # Apply getter(s) to src, returning the first result that does not raise
    # and (when given) matches expected_type
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2656
2657
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict keeping only items for which cndn(key, value) is true."""
    return {key: value for key, value in dct.items() if cndn(key, value)}
2660
2661
def merge_dicts(*dicts):
    """Merge dicts left-to-right; earlier values win, except that a stored
    empty string may be replaced by a later string value."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            take = (value is not None and key not in merged
                    or isinstance(value, str) and merged[key] == '')
            if take:
                merged[key] = value
    return merged
2670
2671
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as str, decoding bytes-like input with *encoding*."""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
2674
2675
# US movie ratings mapped to age limits, as consumed by parse_age_limit()
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines mapped to age limits, as consumed by parse_age_limit()
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2693
2694
def parse_age_limit(s):
    """Parse an age limit (int, 'NN+', a US rating or TV guideline) into an int age."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if m:
        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
    return None
2711
2712
def strip_jsonp(code):
    """Strip a JSONP wrapper ("callback({...});") and return the JSON payload."""
    jsonp_re = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_re.sub(r'\g<callback_data>', code)
2721
2722
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript object/value literal into valid JSON text.

    vars is a dict of var, val pairs to substitute.  When strict is true,
    unknown identifiers raise ValueError instead of being quoted as strings.
    """
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Keep escapes JSON understands; rewrite \xNN to \u00NN; drop escaped
        # newlines (JS line continuations); unescape everything else
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Recursively evaluate a template-literal ${...} expression
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            # Expand ${...} in backtick strings, then re-quote with "
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Integers used as object keys must become JSON strings
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # new Map([[k, v], ...]) -> plain JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # Best-effort reductions of common JS constructor/IIFE wrappers
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2801
2802
def qualities(quality_ids):
    """Return a ranking function mapping a quality id to its position in
    *quality_ids* (-1 for unknown ids)."""
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2811
2812
# Stages at which postprocessors can be scheduled to run, in pipeline order
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default --output templates for regular downloads and for split chapters
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Recognized --output TYPES; values look like per-type filename infixes
# (None where not applicable) — NOTE(review): inferred from the values,
# confirm against the output-template handling in YoutubeDL
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template: {0} is substituted with the key pattern, {1} with the
# allowed conversion-type pattern
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
    '''


# Conversion types understood by %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2852
2853
def limit_length(s, length):
    """Truncate *s* to at most *length* characters, ending with an ellipsis."""
    if s is None:
        return None
    ELLIPSES = '...'
    return s if len(s) <= length else s[:length - len(ELLIPSES)] + ELLIPSES
2862
2863
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2866
2867
def is_outdated_version(version, limit, assume_new=True):
    """True if *version* is strictly older than *limit*.

    Missing/unparsable versions yield ``not assume_new``.
    """
    def as_tuple(v):
        # Inlined version_tuple(): dotted/dashed version string -> int tuple
        return tuple(int(e) for e in re.split(r'[-.]', v))

    try:
        if version:
            return as_tuple(version) < as_tuple(limit)
    except ValueError:
        pass
    return not assume_new
2875
2876
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # NOTE(review): imported lazily, presumably to avoid an import cycle
    # with ..update — confirm before moving to module level
    from ..update import is_non_updateable

    return not is_non_updateable()
2883
2884
def args_to_str(args):
    """Return a short shell-quoted string representation of a subprocess command."""
    return ' '.join(map(compat_shlex_quote, args))
2888
2889
def error_to_str(err):
    """Format an exception as 'TypeName: message'."""
    return ': '.join((type(err).__name__, str(err)))
2892
2893
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (possibly with parameters) to a file extension.

    Falls back to *default* when given, else to the bare subtype
    (with '+' replaced by '.'); non-str input yields *default*/None.
    """
    if not isinstance(mt, str):
        if default is not NO_DEFAULT:
            return default
        return None

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop any parameters (e.g. "; charset=...") and normalize case
    mimetype = mt.partition(';')[0].strip().lower()
    _, _, subtype = mimetype.rpartition('/')

    # Try the full type, then the subtype, then the suffix after '+'
    # (e.g. the 'xml' of 'svg+xml')
    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    elif default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
2982
2983
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension or URL; None if unknown."""
    if not ext_or_url:
        return None
    if '.' not in ext_or_url:
        # Bare extension: give mimetypes a dummy filename to inspect
        ext_or_url = f'file.{ext_or_url}'
    mime, _ = mimetypes.guess_type(ext_or_url)
    return mime
2990
2991
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string into vcodec/acodec/scodec/dynamic_range."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    # hdr holds the dynamic-range tag (DV / HDR10) derived from the video codec
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Strip leading zeros from each dotted part before comparing
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                continue
            vcodec = full_codec
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Two unrecognized codecs: assume they are in video,audio order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3032
3033
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension able to hold the given video/audio codecs,
    honouring *preferences*; falls back to 'mkv' (when allowed) or the last
    preference."""
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    # Multiple video or audio streams: only mkv is known to hold them
    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Normalize each codec to its lowercased first dotted part without zeros
    sanitize_codec = functools.partial(
        try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    # Codec-based matching failed: fall back to extension-family compatibility
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3073
3074
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Infer a file extension from response headers: Content-Disposition
    filename, then x-amz-meta-name, then Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    meta_name = getheader('x-amz-meta-name')
    if meta_name:
        ext = meta_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(getheader('Content-Type'), default=default)
3093
3094
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 base64 data: URI for *data* with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3097
3098
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # Either no limit was requested or the content carries no restriction
        return False
    return age_limit < content_limit
3107
3108
# List of known byte-order-marks (BOM)
# Each entry maps a BOM byte sequence to the encoding it signals. The UTF-32
# entries must stay listed before the UTF-16 ones: b'\xff\xfe' (UTF-16-LE) is
# a prefix of b'\xff\xfe\x00\x00' (UTF-32-LE), so consumers iterating this
# list in order (e.g. is_html) see the longer BOM first.
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3117
3118
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    detected_encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip (possibly repeated) BOMs; the last one stripped decides the encoding
        while first_bytes.startswith(bom):
            detected_encoding = bom_encoding
            first_bytes = first_bytes[len(bom):]

    return re.match(r'^\s*<', first_bytes.decode(detected_encoding, 'replace'))
3128
3129
def determine_protocol(info_dict):
    """Return the download protocol for *info_dict*, deducing it from the URL
    when no explicit 'protocol' field is present."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for proto in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(proto):
            return proto

    ext = determine_ext(url)
    if ext == 'm3u8':
        # Live HLS must use the native-incompatible 'm3u8' downloader
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3150
3151
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param delim        Character (or False) used to draw a separator row under the header
    @param extra_gap    Extra spaces added between columns
    @param hide_empty   Hide columns whose data cells are all empty
    """
    def width(string):
        # Display width: terminal escape sequences and tabs occupy no columns
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only the cells whose corresponding filter entry is truthy;
        # cells beyond the end of filterArray are always kept
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # With hide_empty, columns whose maximum data width is 0 are dropped
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Replace the tab with padding so the trailing part is right-aligned
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3182
3183
3184 def _match_one(filter_part, dct, incomplete):
3185 # TODO: Generalize code with YoutubeDL._build_format_filter
3186 STRING_OPERATORS = {
3187 '*=': operator.contains,
3188 '^=': lambda attr, value: attr.startswith(value),
3189 '$=': lambda attr, value: attr.endswith(value),
3190 '~=': lambda attr, value: re.search(value, attr),
3191 }
3192 COMPARISON_OPERATORS = {
3193 **STRING_OPERATORS,
3194 '<=': operator.le, # "<=" must be defined above "<"
3195 '<': operator.lt,
3196 '>=': operator.ge,
3197 '>': operator.gt,
3198 '=': operator.eq,
3199 }
3200
3201 if isinstance(incomplete, bool):
3202 is_incomplete = lambda _: incomplete
3203 else:
3204 is_incomplete = lambda k: k in incomplete
3205
3206 operator_rex = re.compile(r'''(?x)
3207 (?P<key>[a-z_]+)
3208 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3209 (?:
3210 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3211 (?P<strval>.+?)
3212 )
3213 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3214 m = operator_rex.fullmatch(filter_part.strip())
3215 if m:
3216 m = m.groupdict()
3217 unnegated_op = COMPARISON_OPERATORS[m['op']]
3218 if m['negation']:
3219 op = lambda attr, value: not unnegated_op(attr, value)
3220 else:
3221 op = unnegated_op
3222 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3223 if m['quote']:
3224 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3225 actual_value = dct.get(m['key'])
3226 numeric_comparison = None
3227 if isinstance(actual_value, (int, float)):
3228 # If the original field is a string and matching comparisonvalue is
3229 # a number we should respect the origin of the original field
3230 # and process comparison value as a string (see
3231 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3232 try:
3233 numeric_comparison = int(comparison_value)
3234 except ValueError:
3235 numeric_comparison = parse_filesize(comparison_value)
3236 if numeric_comparison is None:
3237 numeric_comparison = parse_filesize(f'{comparison_value}B')
3238 if numeric_comparison is None:
3239 numeric_comparison = parse_duration(comparison_value)
3240 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3241 raise ValueError('Operator %s only supports string values!' % m['op'])
3242 if actual_value is None:
3243 return is_incomplete(m['key']) or m['none_inclusive']
3244 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3245
3246 UNARY_OPERATORS = {
3247 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3248 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3249 }
3250 operator_rex = re.compile(r'''(?x)
3251 (?P<op>%s)\s*(?P<key>[a-z_]+)
3252 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3253 m = operator_rex.fullmatch(filter_part.strip())
3254 if m:
3255 op = UNARY_OPERATORS[m.group('op')]
3256 actual_value = dct.get(m.group('key'))
3257 if is_incomplete(m.group('key')) and actual_value is None:
3258 return True
3259 return op(actual_value)
3260
3261 raise ValueError('Invalid filter part %r' % filter_part)
3262
3263
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
    Can be True/False to indicate all/none of the keys may be missing.
    All conditions on incomplete keys pass if the key is missing
    """
    # Conditions are joined with '&'; a literal '&' can be escaped as '\&'
    conditions = re.split(r'(?<!\\)&', filter_str)
    return all(
        _match_one(condition.replace(r'\&', '&'), dct, incomplete)
        for condition in conditions)
3274
3275
def match_filter_func(filters, breaking_filters=None):
    """Build a match-filter callable from filter strings.

    The returned function takes (info_dict, incomplete) and returns None when
    the video passes, NO_DEFAULT to request interactive confirmation, or a
    skip-reason string otherwise. Returns None when no filters are given.
    """
    if not filters and not breaking_filters:
        return None

    # Failing a breaking filter aborts via RejectedVideoReached instead of skipping
    breaking_check = match_filter_func(breaking_filters) or (lambda _, __: None)

    filter_set = set(variadic(filters or []))
    # A lone '-' requests interactive confirmation for passing videos
    interactive = '-' in filter_set
    filter_set.discard('-')

    def _match_func(info_dict, incomplete=False):
        breaking_reason = breaking_check(info_dict, incomplete)
        if breaking_reason is not None:
            raise RejectedVideoReached(breaking_reason)

        if not filter_set or any(match_str(f, info_dict, incomplete) for f in filter_set):
            return NO_DEFAULT if interactive and not incomplete else None
        video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
        filter_str = ') | ('.join(map(str.strip, filter_set))
        return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3298
3299
class download_range_func:
    """Produce the sections (matched chapters and/or time ranges) of a video
    to download. Instances are callable with (info_dict, ydl) and yield one
    dict per section."""

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        # Yield every chapter whose title matches any of the requested regexes
        for regex in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': index}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # Nothing was requested: a single empty section means "whole video"
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # Negative timestamps count backwards from the end of the video
        if info.get('duration') and time < 0:
            return max(info['duration'] + time, 0)
        return time

    def __eq__(self, other):
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges)

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3340
3341
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (None when unparsable)."""
    if not time_expr:
        return None

    # Plain offset, optionally suffixed with 's' (e.g. '12.5s')
    offset = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    # Clock time HH:MM:SS[.fff] (some files use ':' for the fraction separator)
    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3353
3354
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours, minutes, secs, msecs = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, msecs)
3357
3358
def ass_subtitles_timecode(seconds):
    """Format a duration in seconds as an ASS timecode (H:MM:SS.cc)."""
    timetuple = timetuple_from_msec(seconds * 1000)
    # ASS uses centiseconds, so scale down the milliseconds field
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], timetuple.milliseconds / 10)
3362
3363
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Map legacy TTML namespaces onto the ones used below
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # TTML styling attributes that are translated into SRT markup
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    # Helper to expand 'prefix:tag' into a fully namespaced XPath/tag name
    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}          # style id -> resolved style properties
    default_style = {}   # style inherited from the body/div elements

    # XMLParser target that converts one <p> element into SRT-flavoured markup
    class TTMLPElementParser:
        # NOTE(review): class-level mutable attributes are shared by all
        # instances of this (per-call) class; pushes and pops below stay
        # balanced, so state does not leak between paragraphs
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Effective style: default, then referenced style, then inline attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from the enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened by the matching start() call
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Re-serialize the node and feed it through TTMLPElementParser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Normalize legacy namespaces before parsing
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style definitions; repeat while parent styles are still unresolved
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    # Parent not seen yet: retry on the next pass
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style set on body/div becomes the default for all paragraphs
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Derive the end from the duration when no explicit end is given
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3530
3531
def cli_option(params, command_option, param, separator=None):
    """Render a value from *params* as command-line arguments.

    Returns [] when the value is unset; otherwise either
    [command_option, value] or ['command_option<separator>value'].
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3537
3538
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean value from *params* as command-line arguments,
    mapping True/False to *true_value*/*false_value* (unset -> [])."""
    value = params.get(param)
    assert value in (True, False, None)
    if value is None:
        return []
    rendered = true_value if value else false_value
    if separator is None:
        return [command_option, str(rendered)]
    return [f'{command_option}{separator}{rendered}']
3543
3544
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals *expected_value*, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3547
3548
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Select the argument list from *argdict* for the first matching key group.

    Each entry of *keys* may be a single key or a tuple of keys; the first
    group with at least one hit wins, and all its hits are concatenated.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_group in keys:
        hits = [argdict.get(key.lower()) for key in variadic(key_group)]
        hits = [args for args in hits if args is not None]
        if hits:
            # Each hit is itself a list of arguments - flatten them
            return [arg for args in hits for arg in args]
    return default
3567
3568
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve configuration args for a (main_key, exe) pair via cli_configuration_args."""
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    lookup_keys = [f'{root_key}{key}' for key in (keys or [''])]
    if root_key in lookup_keys:  # i.e. an empty suffix was requested
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    else:
        # Only the un-suffixed root key may fall back to compat list handling
        use_compat = False
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3580
3581
class ISO639Utils:
    """Conversions between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the primary subtag matters, so e.g. 'en-US' maps like 'en'
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup; None when the code is unknown
        return next(
            (short for short, long_code in cls._lang_map.items() if long_code == code),
            None)
3786
3787
class ISO3166Utils:
    """Lookup of full country names from ISO 3166-1 alpha-2 codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Keys are stored upper-case; normalize the input before lookup
        normalized = code.upper()
        return cls._country_map.get(normalized)
4049
4050
4051 class GeoUtils:
4052 # Major IPv4 address blocks per country
4053 _country_ip_map = {
4054 'AD': '46.172.224.0/19',
4055 'AE': '94.200.0.0/13',
4056 'AF': '149.54.0.0/17',
4057 'AG': '209.59.64.0/18',
4058 'AI': '204.14.248.0/21',
4059 'AL': '46.99.0.0/16',
4060 'AM': '46.70.0.0/15',
4061 'AO': '105.168.0.0/13',
4062 'AP': '182.50.184.0/21',
4063 'AQ': '23.154.160.0/24',
4064 'AR': '181.0.0.0/12',
4065 'AS': '202.70.112.0/20',
4066 'AT': '77.116.0.0/14',
4067 'AU': '1.128.0.0/11',
4068 'AW': '181.41.0.0/18',
4069 'AX': '185.217.4.0/22',
4070 'AZ': '5.197.0.0/16',
4071 'BA': '31.176.128.0/17',
4072 'BB': '65.48.128.0/17',
4073 'BD': '114.130.0.0/16',
4074 'BE': '57.0.0.0/8',
4075 'BF': '102.178.0.0/15',
4076 'BG': '95.42.0.0/15',
4077 'BH': '37.131.0.0/17',
4078 'BI': '154.117.192.0/18',
4079 'BJ': '137.255.0.0/16',
4080 'BL': '185.212.72.0/23',
4081 'BM': '196.12.64.0/18',
4082 'BN': '156.31.0.0/16',
4083 'BO': '161.56.0.0/16',
4084 'BQ': '161.0.80.0/20',
4085 'BR': '191.128.0.0/12',
4086 'BS': '24.51.64.0/18',
4087 'BT': '119.2.96.0/19',
4088 'BW': '168.167.0.0/16',
4089 'BY': '178.120.0.0/13',
4090 'BZ': '179.42.192.0/18',
4091 'CA': '99.224.0.0/11',
4092 'CD': '41.243.0.0/16',
4093 'CF': '197.242.176.0/21',
4094 'CG': '160.113.0.0/16',
4095 'CH': '85.0.0.0/13',
4096 'CI': '102.136.0.0/14',
4097 'CK': '202.65.32.0/19',
4098 'CL': '152.172.0.0/14',
4099 'CM': '102.244.0.0/14',
4100 'CN': '36.128.0.0/10',
4101 'CO': '181.240.0.0/12',
4102 'CR': '201.192.0.0/12',
4103 'CU': '152.206.0.0/15',
4104 'CV': '165.90.96.0/19',
4105 'CW': '190.88.128.0/17',
4106 'CY': '31.153.0.0/16',
4107 'CZ': '88.100.0.0/14',
4108 'DE': '53.0.0.0/8',
4109 'DJ': '197.241.0.0/17',
4110 'DK': '87.48.0.0/12',
4111 'DM': '192.243.48.0/20',
4112 'DO': '152.166.0.0/15',
4113 'DZ': '41.96.0.0/12',
4114 'EC': '186.68.0.0/15',
4115 'EE': '90.190.0.0/15',
4116 'EG': '156.160.0.0/11',
4117 'ER': '196.200.96.0/20',
4118 'ES': '88.0.0.0/11',
4119 'ET': '196.188.0.0/14',
4120 'EU': '2.16.0.0/13',
4121 'FI': '91.152.0.0/13',
4122 'FJ': '144.120.0.0/16',
4123 'FK': '80.73.208.0/21',
4124 'FM': '119.252.112.0/20',
4125 'FO': '88.85.32.0/19',
4126 'FR': '90.0.0.0/9',
4127 'GA': '41.158.0.0/15',
4128 'GB': '25.0.0.0/8',
4129 'GD': '74.122.88.0/21',
4130 'GE': '31.146.0.0/16',
4131 'GF': '161.22.64.0/18',
4132 'GG': '62.68.160.0/19',
4133 'GH': '154.160.0.0/12',
4134 'GI': '95.164.0.0/16',
4135 'GL': '88.83.0.0/19',
4136 'GM': '160.182.0.0/15',
4137 'GN': '197.149.192.0/18',
4138 'GP': '104.250.0.0/19',
4139 'GQ': '105.235.224.0/20',
4140 'GR': '94.64.0.0/13',
4141 'GT': '168.234.0.0/16',
4142 'GU': '168.123.0.0/16',
4143 'GW': '197.214.80.0/20',
4144 'GY': '181.41.64.0/18',
4145 'HK': '113.252.0.0/14',
4146 'HN': '181.210.0.0/16',
4147 'HR': '93.136.0.0/13',
4148 'HT': '148.102.128.0/17',
4149 'HU': '84.0.0.0/14',
4150 'ID': '39.192.0.0/10',
4151 'IE': '87.32.0.0/12',
4152 'IL': '79.176.0.0/13',
4153 'IM': '5.62.80.0/20',
4154 'IN': '117.192.0.0/10',
4155 'IO': '203.83.48.0/21',
4156 'IQ': '37.236.0.0/14',
4157 'IR': '2.176.0.0/12',
4158 'IS': '82.221.0.0/16',
4159 'IT': '79.0.0.0/10',
4160 'JE': '87.244.64.0/18',
4161 'JM': '72.27.0.0/17',
4162 'JO': '176.29.0.0/16',
4163 'JP': '133.0.0.0/8',
4164 'KE': '105.48.0.0/12',
4165 'KG': '158.181.128.0/17',
4166 'KH': '36.37.128.0/17',
4167 'KI': '103.25.140.0/22',
4168 'KM': '197.255.224.0/20',
4169 'KN': '198.167.192.0/19',
4170 'KP': '175.45.176.0/22',
4171 'KR': '175.192.0.0/10',
4172 'KW': '37.36.0.0/14',
4173 'KY': '64.96.0.0/15',
4174 'KZ': '2.72.0.0/13',
4175 'LA': '115.84.64.0/18',
4176 'LB': '178.135.0.0/16',
4177 'LC': '24.92.144.0/20',
4178 'LI': '82.117.0.0/19',
4179 'LK': '112.134.0.0/15',
4180 'LR': '102.183.0.0/16',
4181 'LS': '129.232.0.0/17',
4182 'LT': '78.56.0.0/13',
4183 'LU': '188.42.0.0/16',
4184 'LV': '46.109.0.0/16',
4185 'LY': '41.252.0.0/14',
4186 'MA': '105.128.0.0/11',
4187 'MC': '88.209.64.0/18',
4188 'MD': '37.246.0.0/16',
4189 'ME': '178.175.0.0/17',
4190 'MF': '74.112.232.0/21',
4191 'MG': '154.126.0.0/17',
4192 'MH': '117.103.88.0/21',
4193 'MK': '77.28.0.0/15',
4194 'ML': '154.118.128.0/18',
4195 'MM': '37.111.0.0/17',
4196 'MN': '49.0.128.0/17',
4197 'MO': '60.246.0.0/16',
4198 'MP': '202.88.64.0/20',
4199 'MQ': '109.203.224.0/19',
4200 'MR': '41.188.64.0/18',
4201 'MS': '208.90.112.0/22',
4202 'MT': '46.11.0.0/16',
4203 'MU': '105.16.0.0/12',
4204 'MV': '27.114.128.0/18',
4205 'MW': '102.70.0.0/15',
4206 'MX': '187.192.0.0/11',
4207 'MY': '175.136.0.0/13',
4208 'MZ': '197.218.0.0/15',
4209 'NA': '41.182.0.0/16',
4210 'NC': '101.101.0.0/18',
4211 'NE': '197.214.0.0/18',
4212 'NF': '203.17.240.0/22',
4213 'NG': '105.112.0.0/12',
4214 'NI': '186.76.0.0/15',
4215 'NL': '145.96.0.0/11',
4216 'NO': '84.208.0.0/13',
4217 'NP': '36.252.0.0/15',
4218 'NR': '203.98.224.0/19',
4219 'NU': '49.156.48.0/22',
4220 'NZ': '49.224.0.0/14',
4221 'OM': '5.36.0.0/15',
4222 'PA': '186.72.0.0/15',
4223 'PE': '186.160.0.0/14',
4224 'PF': '123.50.64.0/18',
4225 'PG': '124.240.192.0/19',
4226 'PH': '49.144.0.0/13',
4227 'PK': '39.32.0.0/11',
4228 'PL': '83.0.0.0/11',
4229 'PM': '70.36.0.0/20',
4230 'PR': '66.50.0.0/16',
4231 'PS': '188.161.0.0/16',
4232 'PT': '85.240.0.0/13',
4233 'PW': '202.124.224.0/20',
4234 'PY': '181.120.0.0/14',
4235 'QA': '37.210.0.0/15',
4236 'RE': '102.35.0.0/16',
4237 'RO': '79.112.0.0/13',
4238 'RS': '93.86.0.0/15',
4239 'RU': '5.136.0.0/13',
4240 'RW': '41.186.0.0/16',
4241 'SA': '188.48.0.0/13',
4242 'SB': '202.1.160.0/19',
4243 'SC': '154.192.0.0/11',
4244 'SD': '102.120.0.0/13',
4245 'SE': '78.64.0.0/12',
4246 'SG': '8.128.0.0/10',
4247 'SI': '188.196.0.0/14',
4248 'SK': '78.98.0.0/15',
4249 'SL': '102.143.0.0/17',
4250 'SM': '89.186.32.0/19',
4251 'SN': '41.82.0.0/15',
4252 'SO': '154.115.192.0/18',
4253 'SR': '186.179.128.0/17',
4254 'SS': '105.235.208.0/21',
4255 'ST': '197.159.160.0/19',
4256 'SV': '168.243.0.0/16',
4257 'SX': '190.102.0.0/20',
4258 'SY': '5.0.0.0/16',
4259 'SZ': '41.84.224.0/19',
4260 'TC': '65.255.48.0/20',
4261 'TD': '154.68.128.0/19',
4262 'TG': '196.168.0.0/14',
4263 'TH': '171.96.0.0/13',
4264 'TJ': '85.9.128.0/18',
4265 'TK': '27.96.24.0/21',
4266 'TL': '180.189.160.0/20',
4267 'TM': '95.85.96.0/19',
4268 'TN': '197.0.0.0/11',
4269 'TO': '175.176.144.0/21',
4270 'TR': '78.160.0.0/11',
4271 'TT': '186.44.0.0/15',
4272 'TV': '202.2.96.0/19',
4273 'TW': '120.96.0.0/11',
4274 'TZ': '156.156.0.0/14',
4275 'UA': '37.52.0.0/14',
4276 'UG': '102.80.0.0/13',
4277 'US': '6.0.0.0/8',
4278 'UY': '167.56.0.0/13',
4279 'UZ': '84.54.64.0/18',
4280 'VA': '212.77.0.0/19',
4281 'VC': '207.191.240.0/21',
4282 'VE': '186.88.0.0/13',
4283 'VG': '66.81.192.0/20',
4284 'VI': '146.226.0.0/16',
4285 'VN': '14.160.0.0/11',
4286 'VU': '202.80.32.0/20',
4287 'WF': '117.20.32.0/21',
4288 'WS': '202.4.32.0/19',
4289 'YE': '134.35.0.0/16',
4290 'YT': '41.242.116.0/22',
4291 'ZA': '41.0.0.0/11',
4292 'ZM': '102.144.0.0/13',
4293 'ZW': '102.177.192.0/18',
4294 }
4295
4296 @classmethod
4297 def random_ipv4(cls, code_or_block):
4298 if len(code_or_block) == 2:
4299 block = cls._country_ip_map.get(code_or_block.upper())
4300 if not block:
4301 return None
4302 else:
4303 block = code_or_block
4304 addr, preflen = block.split('/')
4305 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4306 addr_max = addr_min | (0xffffffff >> int(preflen))
4307 return str(socket.inet_ntoa(
4308 struct.pack('!L', random.randint(addr_min, addr_max))))
4309
4310
4311 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4312 # released into Public Domain
4313 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4314
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    # int.to_bytes yields the minimal big-endian representation directly,
    # replacing the manual 32-bit pack loop and leading-zero stripping pass.
    # Non-positive n maps to b'\x00', matching the original's fallback branch.
    s = n.to_bytes((n.bit_length() + 7) // 8, 'big') if n > 0 else b'\000'
    # Front-pad with zeros up to a multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4343
4344
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes handles any length (including empty -> 0) in one call,
    # replacing the manual zero-padding and 4-byte struct.unpack loop
    return int.from_bytes(s, 'big')
4360
4361
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # Hexlifying the reversed bytes and parsing as base 16 is exactly
    # little-endian integer decoding; int.from_bytes also accepts empty
    # input (-> 0) instead of raising like int('', 16) did
    payload = int.from_bytes(data, 'little')
    return '%x' % pow(payload, exponent, modulus)
4377
4378
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    # The scheme requires at least 8 random padding bytes plus 3 marker bytes
    pad_len = length - len(data) - 3
    if pad_len < 8:
        raise ValueError('Input data too long for PKCS#1 padding')

    padding = [random.randint(0, 254) for _ in range(pad_len)]
    return [0, 2, *padding, 0] + data
4392
4393
4394 def _base_n_table(n, table):
4395 if not table and not n:
4396 raise ValueError('Either table or n must be specified')
4397 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4398
4399 if n and n != len(table):
4400 raise ValueError(f'base {n} exceeds table length {len(table)}')
4401 return table
4402
4403
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4415
4416
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    digit_values = {char: idx for idx, char in enumerate(_base_n_table(n, table))}
    base = len(digit_values)
    value = 0
    for char in string:
        value = value * base + digit_values[char]
    return value
4424
4425
def decode_packed_codes(code):
    """Rebuild the symbol table of packed code and substitute the symbols back in."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in range(count):
        key = encode_base_n(index, base)
        # An empty symbol means the identifier stands for itself
        symbol_table[key] = symbols[index] or key

    return re.sub(
        r'\b(\w+)\b', lambda match: symbol_table[match.group(0)],
        obfuscated_code)
4442
4443
def caesar(s, alphabet, shift):
    """Shift every character of *s* that occurs in *alphabet* by *shift* positions (wrapping)."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        pos = alphabet.find(char)
        # Characters outside the alphabet pass through unchanged
        return char if pos < 0 else alphabet[(pos + shift) % size]

    return ''.join(map(rotate, s))
4451
4452
def rot47(s):
    """Apply the ROT47 cipher: rotate the printable ASCII range 33..126 by 47 places."""
    # Equivalent to a caesar shift of 47 over the 94-character printable alphabet
    return ''.join(
        chr(33 + (ord(c) - 33 + 47) % 94) if 33 <= ord(c) <= 126 else c
        for c in s)
4455
4456
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (KEY=value,KEY2="quoted,value") into a dict."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Quoted values may contain commas; strip the surrounding quotes
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
4464
4465
def urshift(val, n):
    """Unsigned (logical) right shift of a 32-bit value, like JavaScript's `>>>`."""
    if val >= 0:
        return val >> n
    # Reinterpret a negative 32-bit int as its unsigned equivalent first
    return (val + 0x100000000) >> n
4468
4469
def write_xattr(path, key, value):
    """Write the extended attribute *key* = *value* (bytes) on file *path*.

    Tries, in order: NTFS Alternate Data Streams (Windows), the python
    xattr/pyxattr modules, then the setfattr/xattr command-line tools.
    Raises XAttrMetadataError when writing fails and XAttrUnavailableError
    when no method of writing is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The CLI tools expect a text value, not bytes
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4519
4520
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to a random date between 1950 and 1995."""
    first, last = datetime.date(1950, 1, 1), datetime.date(1995, 12, 31)
    chosen = first + datetime.timedelta(days=random.randint(0, (last - first).days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
4531
4532
def find_available_port(interface=''):
    """Bind an ephemeral port on *interface* and return its number, or None on failure."""
    with contextlib.suppress(OSError):
        with socket.socket() as sock:
            sock.bind((interface, 0))  # port 0 lets the OS pick a free port
            return sock.getsockname()[1]
    return None
4540
4541
# Templates for internet shortcut files, which are plain text files.

# Windows/freedesktop ".url" shortcut
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS ".webloc" shortcut (an XML property list)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop ".desktop" link entry
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps the --write-link format name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4573
4574
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    # Rebuild the authority component piece by piece (userinfo, host, port)
    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    # NOTE(review): port 80 is elided regardless of scheme, even though it is
    # only the default for http (not https) — confirm this is intended
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4617
4618
def to_high_limit_path(path):
    """On Windows, prefix *path* with \\\\?\\ to bypass the MAX_PATH limit; elsewhere return it unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # Work around MAX_PATH limitation on Windows. The maximum allowed length
    # for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
4625
4626
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Traverse *field* in *obj* and format the value with *template*, or return *default*."""
    val = traversal.traverse_obj(obj, *variadic(field))
    # Without an explicit ignore list, any falsy value is skipped
    skip = (val in variadic(ignore)) if ignore is not NO_DEFAULT else not val
    if skip:
        return default
    return template % func(val)
4632
4633
def clean_podcast_url(url):
    """Strip known podcast analytics/tracker prefixes from *url*."""
    url = re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/''', '', url)
    # If a tracker prefix was removed, a nested scheme ("https://https://...")
    # may remain; collapse it down to the inner URL
    return re.sub(r'^\w+://(\w+://)', r'\1', url)
4654
4655
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random UUIDv4-shaped string (NB: the variant nibble is fully random)."""
    return ''.join(
        _HEX_TABLE[random.randint(0, 15)] if ch in 'xy' else ch
        for ch in 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4661
4662
def make_dir(path, to_screen=None):
    """Create the parent directory of *path* (like `mkdir -p` on its dirname).

    @param path       file path whose parent directory should exist afterwards
    @param to_screen  optional callable used to report a failure message
    @returns          True on success (or nothing to do), False on OSError
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Fixed: `callable(to_screen) is not None` was always True (callable()
        # returns a bool), so a None to_screen raised TypeError on this path
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4673
4674
def get_executable_path():
    """Return the directory containing the program's executable (per ..update's detection)."""
    from ..update import _get_variant_and_executable_path

    _, exe_path = _get_variant_and_executable_path()
    return os.path.dirname(os.path.abspath(exe_path))
4679
4680
def get_user_config_dirs(package_name):
    """Yield the per-user configuration directory candidates for *package_name*."""
    # XDG base directory (e.g. ~/.config/package_name)
    yield os.path.join(
        os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config'), package_name)

    # Windows roaming appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # Dotted directory in the home folder (~/.package_name)
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')
4693
4694
def get_system_config_dirs(package_name):
    """Yield system-wide configuration directories (currently only under /etc)."""
    base = '/etc'
    yield os.path.join(base, package_name)
4698
4699
def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
    """
    # kwargs are timedelta arguments (hours=..., days=...) applied as an offset
    offset = datetime.timedelta(**kwargs)
    return time.time() + offset.total_seconds()
4705
4706
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JWS Compact Serialization token signed with HMAC-SHA256.

    @param payload_data  JSON-serializable payload (the JWT claims)
    @param key           shared secret (str) used for the signature
    @param headers       optional extra JOSE header fields (may override alg/typ)
    @returns             the token as bytes: b"<header>.<payload>.<signature>"
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    # Fixed: headers previously defaulted to a mutable {} (shared across calls)
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4724
4725
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode the payload of a JWS compact token WITHOUT verifying its signature."""
    _, payload_b64, _ = jwt.split('.')
    # Re-add any stripped '=' padding; superfluous '='s are ignored by b64decode
    return json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
4732
4733
# Tri-state flag: None where VT handling is moot (non-Windows), False on
# Windows until windows_enable_vt_mode() succeeds and flips it to True
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4735
4736
@functools.cache
def supports_terminal_sequences(stream):
    """Whether ANSI terminal escape sequences may be emitted when writing to *stream*."""
    if compat_os_name == 'nt':
        # Requires VT processing to have been enabled via windows_enable_vt_mode()
        vt_available = bool(WINDOWS_VT_MODE)
    else:
        vt_available = bool(os.getenv('TERM'))
    if not vt_available:
        return False
    try:
        return stream.isatty()
    except BaseException:
        # isatty may be missing or raise on exotic stream objects
        return False
4748
4749
def windows_enable_vt_mode():
    """Ref: https://bugs.python.org/issue30075 """
    # Enable ANSI/VT escape sequence processing on the Windows console and
    # flip the global WINDOWS_VT_MODE flag on success
    if get_windows_version() < (10, 0, 10586):
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly, independent of stdout redirection
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Preserve the existing console mode bits; only add VT processing
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Cached results were computed with VT disabled; force re-evaluation
    supports_terminal_sequences.cache_clear()
4780
4781
# Matches ANSI SGR escape sequences such as "\033[0;31m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip ANSI terminal (SGR) escape sequences from *string*."""
    return re.sub(_terminal_sequences_re, '', string)
4787
4788
def number_of_digits(number):
    """Number of characters in the decimal representation of *number* (sign included)."""
    as_decimal = '%d' % number
    return len(as_decimal)
4791
4792
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy *values* with *delim*; with *from_dict*, values are traversal paths into it."""
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
4797
4798
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in dimension_keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        # No format declares a width; leave the thumbnails untouched
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled.append(merge_dicts(
            {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
            dict(zip(dimension_keys, max_dimensions)), thumbnail))
    return scaled
4819
4820
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if mobj is None:
        return None, None, None
    start, end, total = mobj.groups()
    # end and total are optional in the header, hence the lenient conversion
    return int(start), int_or_none(end), int_or_none(total)
4829
4830
def read_stdin(what):
    """Announce that *what* will be read from STDIN and return the stdin stream."""
    eof_key = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof_key}) to end:\n')
    return sys.stdin
4835
4836
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """

    # BOM marks are given priority over declarations
    for bom, enc in BOMS:
        if data.startswith(bom):
            return enc, len(bom)

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if mobj:
        return mobj.group(1).decode(), 0
    return None, 0
4853
4854
class Config:
    """One source of program arguments (CLI, a config file, or stdin).

    Configs referenced via --config-locations are loaded recursively as child
    Config objects; `all_args` flattens them with the own args last so they
    take precedence.
    """
    own_args = None     # raw args this config was created with
    parsed_args = None  # set to own_args once load_configs() has run
    filename = None     # path of the backing config file, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Attach args/filename and load referenced configs (may only run once)."""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own args and append configs named in --config-locations.

        @returns False if this config file was already loaded (cycle guard), else True
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # '-' means: read additional options from stdin (only once)
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against this config's directory
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read and shlex-split a config file; return *default* if it is missing."""
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # Fixed: the message previously contained a literal "(unknown)"
            # placeholder instead of the offending filename
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return *opts* with the values of credential options replaced by PRIVATE."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        # Scrub "--password=..." style first, then the separate-argument style
        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child Config (sharing the loaded-path set) and append it if it loads."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """All args, most recently appended configs first, own args last (highest precedence)."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
4962
4963
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes"""
    pool = None  # the open connection, set lazily by __enter__

    def __init__(self, url, headers=None, connect=True):
        # A dedicated event loop drives all websocket coroutines synchronously
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Ensure the connection/loop are torn down at interpreter exit
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        # Blocking send: runs the coroutine to completion on the private loop
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        # Blocking receive
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # NOTE(review): the loop is closed *before* _cancel_all_tasks runs
            # against it — looks inverted; confirm whether this order is intended
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run coroutine *main* to completion on *loop*, shutting down generators/executors after."""
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        # Cancel pending tasks and surface any unexpected exceptions they held
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5033
5034
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for key, value in headers.items():
            # Title-casing normalizes the key so differing cases collapse
            merged[key.title()] = value
    return merged
5038
5039
def cached_method(f):
    """Cache a method"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Normalize the call so differently-spelled but equal calls share a key
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        key = tuple(bound.arguments.values())[1:]  # drop `self` from the key

        # Per-instance cache, namespaced by method name
        caches = vars(self).setdefault('_cached_method__cache', {})
        cache = caches.setdefault(f.__name__, {})
        if key not in cache:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper
5055
5056
class classproperty:
    """property access for class methods with optional caching"""
    def __new__(cls, func=None, *args, **kwargs):
        # Support both @classproperty and @classproperty(cache=True):
        # when called without a function, return a decorator awaiting one
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None  # None disables caching

    def __get__(self, _, cls):
        # Descriptor protocol: value is computed per owning class (the
        # accessing instance is ignored)
        if self._cache is None:
            return self.func(cls)
        elif cls not in self._cache:
            self._cache[cls] = self.func(cls)
        return self._cache[cls]
5075
5076
class function_with_repr:
    """Wrap a callable so that repr() shows either a custom string or its qualified name."""

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func, self.__repr = func, repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5089
5090
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        # Iterating yields the attribute values, in insertion order
        yield from self.__dict__.values()

    @property
    def items_(self):
        return self.__dict__.items()
5100
5101
# Known file extensions grouped by media kind; the "common_*" groups are
# merged into the broader "video"/"audio" groups right below
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# NB: despite Namespace's "Immutable" docstring, attribute rebinding works here
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5116
5117
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        # error_callback is invoked with (error, attempt, retries) after each
        # failed attempt; extra kwargs are pre-bound onto it
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # NO_DEFAULT marks a successful (error-free) attempt; it is a sentinel
        # distinct from None so that None remains a valid error value
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # The error set during the current attempt, or None if none was set
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT  # reset before each attempt
            self.attempt += 1
            yield self
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # Prefer the underlying cause over the wrapped extractor message
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        # sleep_func may be a constant delay or a callable of the retry count
        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5172
5173
def make_archive_id(ie, video_id):
    """Build a download-archive entry ("&lt;lowercased ie key&gt; &lt;video id&gt;") from
    an extractor (or its key as a string) and a video ID."""
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
5177
5178
def truncate_string(s, left, right=0):
    """Shorten `s` to at most ``left + right`` characters, keeping the first
    ``left - 3`` and last ``right`` characters joined by "...".
    Returns `s` unchanged when it is None or already short enough."""
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5184
5185
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Expand `options` (names, aliases and '-'-prefixed negations) into a
    deduplicated list, continuing from `start`.

    `alias_dict` must contain an 'all' key listing every valid value; with
    `use_regex`, non-alias entries are matched against that list as
    case-insensitive full-match regexes. A plain entry that is neither an
    alias nor in alias_dict['all'] raises ValueError.
    """
    assert 'all' in alias_dict, '"all" alias is required'
    selected = list(start or [])
    for token in options:
        negated = token.startswith('-')
        name = token[1:] if negated else token

        if name in alias_dict:
            expansion = alias_dict[name]
            if negated:
                # Negating an alias flips the sign of each entry it expands to
                expansion = [entry[1:] if entry.startswith('-') else f'-{entry}' for entry in expansion]
            # NB: Do not allow regex in aliases for performance
            selected = orderedSet_from_options(expansion, alias_dict, start=selected)
            continue

        if use_regex:
            matched = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            matched = [name]
        else:
            raise ValueError(name)

        if negated:
            for entry in matched:
                while entry in selected:
                    selected.remove(entry)
        else:
            selected.extend(matched)

    return orderedSet(selected)
5214
5215
5216 # TODO: Rewrite
class FormatSorter:
    """Sorts formats according to user- and extractor-supplied field preferences.

    Fields, their aliases and comparison semantics are declared in `settings`;
    `evaluate_params` builds the effective field order (`self._order`) from the
    defaults, the user's 'format_sort' param and the extractor's preferences,
    and `calculate_preference` maps a format dict to a tuple of per-field
    comparison tuples usable as a sort key.
    """

    # One item of a sort string: optional '+' (reverse), a field name, and an
    # optional limit introduced by ':' or by '~' (prefer values closest to it)
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Alternative default order (youtube-dl-style, judging by the name)
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration; keys absent here are materialized on demand by
    # _get_field_setting. For 'ordered' fields, earlier entries in 'order' rank
    # higher; 'order_free' is used instead when prefer_free_formats is set.
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        # Combined/multiple fields derived from the simple fields above
        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        # field_preference: sort order requested by the extractor
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return settings[field][key], materializing and caching a
        type-dependent default when the key (or the whole field) is missing.
        Unknown fields trigger a deprecation warning and get an empty entry."""
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')  # NOTE: shadows the builtin; kept as-is
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert a raw field (or limit) value per the field's 'convert'
        setting. For 'order' fields, returns the value's rank in the order
        list (earlier entries rank higher); with convertNone, a None value is
        ranked like any other instead of being passed through."""
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            # 'float_string' (and any other) conversion: numeric strings become
            # floats; otherwise the field degrades to string comparison
            if value.isnumeric():
                return float(value)
            else:
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Build self._order (and each field's reverse/closest/limit data)
        from: forced defaults, then priority defaults (unless
        'format_sort_force'), the user's 'format_sort', the extractor's order
        and finally the remaining defaults. First occurrence of a field wins."""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Register one field in the sort order; duplicates are ignored
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            # 'combined' fields may take one limit per sub-field, colon-separated
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Emit the user/extractor-given and effective sort orders via write_debug."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Map one field's raw value to a comparison tuple, applying the
        field's reverse/closest/limit settings and type-specific conversion."""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Extract the field's value from `format` (combining sub-fields for
        'multiple' types) and convert it to a comparison tuple."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Fill in missing protocol/ext/bitrate entries of `format` (in place)
        and return its sort key: one comparison tuple per ordered field."""
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5511
5512
5513 # XXX: Temporary
5514 class _YDLLogger:
5515 def __init__(self, ydl=None):
5516 self._ydl = ydl
5517
5518 def debug(self, message):
5519 if self._ydl:
5520 self._ydl.write_debug(message)
5521
5522 def info(self, message):
5523 if self._ydl:
5524 self._ydl.to_screen(message)
5525
5526 def warning(self, message, *, once=False):
5527 if self._ydl:
5528 self._ydl.report_warning(message, only_once=once)
5529
5530 def error(self, message, *, is_error=True):
5531 if self._ydl:
5532 self._ydl.report_error(message, is_error=is_error)
5533
5534 def stdout(self, message):
5535 if self._ydl:
5536 self._ydl.to_stdout(message)
5537
5538 def stderr(self, message):
5539 if self._ydl:
5540 self._ydl.to_stderr(message)