]> jfr.im git - yt-dlp.git/blob - yt_dlp/utils/_utils.py
[networking] Add module (#2861)
[yt-dlp.git] / yt_dlp / utils / _utils.py
1 import asyncio
2 import atexit
3 import base64
4 import binascii
5 import calendar
6 import codecs
7 import collections
8 import collections.abc
9 import contextlib
10 import datetime
11 import email.header
12 import email.utils
13 import errno
14 import hashlib
15 import hmac
16 import html.entities
17 import html.parser
18 import http.client
19 import http.cookiejar
20 import inspect
21 import io
22 import itertools
23 import json
24 import locale
25 import math
26 import mimetypes
27 import netrc
28 import operator
29 import os
30 import platform
31 import random
32 import re
33 import shlex
34 import socket
35 import ssl
36 import struct
37 import subprocess
38 import sys
39 import tempfile
40 import time
41 import traceback
42 import types
43 import unicodedata
44 import urllib.error
45 import urllib.parse
46 import urllib.request
47 import xml.etree.ElementTree
48
49 from . import traversal
50
51 from ..compat import functools # isort: split
52 from ..compat import (
53 compat_etree_fromstring,
54 compat_expanduser,
55 compat_HTMLParseError,
56 compat_os_name,
57 compat_shlex_quote,
58 )
59 from ..dependencies import websockets, xattr
60
# Make this private submodule report itself as its parent package
# (`yt_dlp.utils`) in tracebacks and introspection
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


# Canned User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
70
71
class NO_DEFAULT:
    """Sentinel used as a default argument to distinguish 'not given' from None."""
    pass
74
75
def IDENTITY(x):
    """Identity function; used as a default transform/callback."""
    return x
78
79
# Month names used when parsing free-form dates, by language code
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
95
96 # From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Timezone abbreviation -> UTC offset in hours
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
105
106 # needed for sanitizing filenames in restricted mode
# Accented character -> ASCII replacement(s);
# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
110
# strptime formats tried when parsing free-form date strings
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Variants preferring day-first (European style) interpretation of ambiguous dates
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# Variants preferring month-first (US style) interpretation of ambiguous dates
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of a P.A.C.K.E.R. obfuscated-JS payload
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> block and captures the JSON
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# A non-negative decimal number, optionally with a fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
181
182
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
197
198
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # Write to a sibling temp file first, then rename over the target
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile is created 0600; widen to the umask-derived
            # default permissions a normal open() would have produced
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file; re-raise the original error
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
223
224
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    # Only plain attribute names are supported; anything else would break the
    # predicate syntax below
    assert re.match(r'^[a-zA-Z_-]+$', key)
    predicate = f'[@{key}]' if val is None else f"[@{key}='{val}']"
    return node.find(xpath + predicate)
230
231 # On python2.6 the xml.etree.ElementTree.Element methods don't support
232 # the namespace parameter
233
234
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' components of *path* into '{uri}tag' form using *ns_map*."""
    def _expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            return parts[0]
        ns, tag = parts
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join(_expand(component) for component in path.split('/'))
245
246
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching *xpath* (a string, or an iterable of
    candidate xpaths tried in order) under *node*.

    @param name     Human-readable name used in the error message
    @param fatal    Raise ExtractorError instead of returning None when missing
    @param default  Value returned when nothing matches
    """
    def _find_xpath(xpath):
        return node.find(xpath)

    # Initialize so an empty iterable of xpaths falls through to the
    # not-found handling instead of raising NameError
    n = None
    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
268
269
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but return the matched element's text content.

    @param name     Human-readable name used in the error message
    @param fatal    Raise ExtractorError instead of returning None when missing
    @param default  Value returned when the element or its text is missing
    """
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text
283
284
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute *key* of the first element matching xpath[@key].

    @param name     Human-readable name used in the error message
    @param fatal    Raise ExtractorError instead of returning None when missing
    @param default  Value returned when no matching element exists
    """
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
296
297
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)
301
302
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
306
307
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    return matches[0] if matches else None
312
313
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_html_by_class(class_name, html)
    return matches[0] if matches else None
318
319
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute in the passed HTML document"""
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None
323
324
def get_element_html_by_attribute(attribute, value, html, **kwargs):
    """Return the html of the first tag with the specified attribute in the passed HTML document"""
    # Renamed **kargs -> **kwargs for consistency with the sibling helpers;
    # the catch-all name is not part of the callable interface
    retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None
328
329
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # NOTE(review): **kargs is accepted but not forwarded — confirm whether
    # this is intentional signature compatibility or an oversight
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
335
336
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # The class attribute may contain several space-separated names; match
    # class_name as a whole word within the attribute value
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
342
343
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [text for text, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
347
348
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [outer_html for _, outer_html in get_elements_text_and_html_by_attribute(*args, **kwargs)]
352
353
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    # If the value contains characters that must be quoted in HTML attributes,
    # require quotes around it; otherwise quotes are optional
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Match the opening tag up to and including attribute="value"
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        # Strip a matching pair of surrounding quotes from the content, then
        # decode HTML entities
        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
379
380
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Control-flow exception: signals that the outermost tag was closed
        pass

    def __init__(self):
        # Stack of currently-open tag names
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until the matching opening tag is found (tolerates unclosed
        # inner tags)
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
421
422
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index wrapped to raise a parse error instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed only the opening tag first to verify it parsed as expected
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed up to each candidate closing tag until the parser signals that
        # the outermost tag was actually closed (skips nested same-name tags)
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
457
458
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the first start tag encountered
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        # Abort parsing after the first tag; the caller suppresses this error
        raise compat_HTMLParseError('done')
469
470
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser collecting the attributes of each top-level <li> element."""

    def __init__(self):
        super().__init__()
        self.items = []   # one attribute dict per top-level <li>
        self._level = 0   # current tag nesting depth

    def handle_starttag(self, tag, attrs):
        # Record only <li> tags at the outermost nesting level
        if self._level == 0 and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
486
487
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    # The parser deliberately aborts with compat_HTMLParseError after the
    # first start tag; swallow that control-flow exception
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
507
508
def parse_list(webpage):
    """Parse a string containing a series of HTML <li> elements and
    return a list with the attribute dict of each element."""
    collector = HTMLListAttrsParser()
    collector.feed(webpage)
    collector.close()
    return collector.items
516
517
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Collapse whitespace, then turn <br> and paragraph boundaries into newlines
    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
532
533
class LenientJSONDecoder(json.JSONDecoder):
    """JSON decoder tolerating common real-world deviations.

    @param transform_source  Callable applied to the input string before decoding
    @param ignore_extra      Ignore trailing data after the first JSON value
    @param close_objects     Max number of unterminated objects/arrays to auto-close
    """
    # TODO: Write tests

    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # Each unterminated object may need up to two repairs (comma handling
        # plus inserting the closing bracket)
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Attempt to repair the truncated document described by *err*.

        Returns the repaired document, or None when no repair is applicable.
        """
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        """Decode *s*, retrying with auto-closed objects when configured."""
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    # Keep `s` intact unless a repair was produced; previously
                    # `s` was clobbered with None and the re-raise below
                    # crashed with TypeError instead of a JSONDecodeError
                    fixed = self._close_object(e)
                    if fixed is not None:
                        s = fixed
                        continue
                # Clamp the context start so small e.pos does not wrap around
                # to the end of the string
                raise type(e)(f'{e.msg} in {s[max(e.pos - 10, 0):e.pos + 10]!r}', s, e.pos)
        # Unlike a bare `assert`, this is not stripped under `python -O`
        raise AssertionError('Too many attempts to decode JSON')
572
573
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # First attempt uses the filename as-is; second attempt (if reached) uses
    # the sanitized filename
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Fall back to an unlocked open if locking is unavailable
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Permission errors will not be fixed by renaming; give up
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
611
612
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when *timestr* cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    return None if parsed is None else email.utils.mktime_tz(parsed)
620
621
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            # '\0' marks substitute characters for later cleanup passes
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        # Collapse runs of underscores and trim cosmetic leftovers
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
675
676
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows.

    @param force  Apply Windows-style sanitization even on other platforms
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        # Nothing to do on non-Windows platforms unless forced
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters invalid in Windows path components (keep '.'/'..')
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Preserve a leading separator on forced sanitization
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
698
699
def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return None
    if url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    for typo, fixup in (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    ):
        if re.match(typo, url):
            return re.sub(typo, fixup, url)
    return url
718
719
def extract_basic_auth(url):
    """Strip userinfo from *url*.

    Returns (url_without_credentials, basic_auth_header_or_None).
    """
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild the netloc without the user:password@ prefix
    netloc = parts.hostname if parts.port is None else f'{parts.hostname}:{parts.port}'
    clean_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = f'{parts.username}:{parts.password or ""}'.encode()
    return clean_url, f'Basic {base64.b64encode(credentials).decode()}'
730
731
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request with the URL sanitized/escaped and any inline
    user:password credentials moved into an Authorization header."""
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # Inject into the positional headers dict if given, else into kwargs
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
738
739
def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))
743
744
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable, preserving order.

    @param lazy  Return a generator instead of a materialized list
    """
    def _dedup():
        seen = []  # a list, not a set: the items may be unhashable
        for item in iterable:
            if item in seen:
                continue
            seen.append(item)
            yield item

    gen = _dedup()
    return gen if lazy else list(gen)
755
756
757 def _htmlentity_transform(entity_with_semicolon):
758 """Transforms an HTML entity to a character."""
759 entity = entity_with_semicolon[:-1]
760
761 # Known non-numeric HTML entity
762 if entity in html.entities.name2codepoint:
763 return chr(html.entities.name2codepoint[entity])
764
765 # TODO: HTML5 allows entities without a semicolon.
766 # E.g. '&Eacuteric' should be decoded as 'Éric'.
767 if entity_with_semicolon in html.entities.html5:
768 return html.entities.html5[entity_with_semicolon]
769
770 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
771 if mobj is not None:
772 numstr = mobj.group(1)
773 if numstr.startswith('x'):
774 base = 16
775 numstr = '0%s' % numstr
776 else:
777 base = 10
778 # See https://github.com/ytdl-org/youtube-dl/issues/7518
779 with contextlib.suppress(ValueError):
780 return chr(int(numstr, base))
781
782 # Unknown entity in name, return its literal representation
783 return '&%s;' % entity
784
785
def unescapeHTML(s):
    """Decode HTML entities in *s*; None passes through unchanged."""
    if s is None:
        return None
    assert isinstance(s, str)

    def _replace(mobj):
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^&;]+;)', _replace, s)
793
794
def escapeHTML(text):
    """Escape &, <, >, and both quote characters for safe HTML embedding."""
    # Single-pass per-character translation; equivalent to chained replaces
    # with '&' handled first
    return text.translate(str.maketrans({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    }))
804
805
class netrc_from_content(netrc.netrc):
    """netrc parser fed from an in-memory string instead of a file."""

    def __init__(self, content):
        # Skip netrc.netrc.__init__ (which reads a file); initialize the
        # containers it would have set up, then parse the given string
        self.hosts = {}
        self.macros = {}
        with io.StringIO(content) as stream:
            self._parse('-', stream, False)
811
812
class Popen(subprocess.Popen):
    """subprocess.Popen wrapper: hides console windows on Windows, restores
    library paths under PyInstaller, and adds convenience helpers."""

    if sys.platform == 'win32':
        # Prevent a console window from flashing up for spawned processes
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
             https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether streams are text-mode so run() can pick defaults
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        """communicate(), but kill the process if communication fails/aborts."""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process; optionally wait up to *timeout* for it to die."""
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run to completion and return (stdout, stderr, returncode)."""
        with cls(*args, **kwargs) as proc:
            # Match the stream type: '' for text mode, b'' for binary
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
869
870
def encodeArgument(s):
    """Return *s* as str, decoding legacy byte strings as ASCII."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
876
877
# (hours, minutes, seconds, milliseconds) of a duration
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration in milliseconds into an (hours, minutes, seconds,
    milliseconds) named tuple."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
886
887
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H<delim>]M<delim>S, optionally with
    a '.mmm' milliseconds suffix."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        formatted = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        formatted = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        formatted = '%d' % t.seconds
    if not msec:
        return formatted
    return '%s.%03d' % (formatted, t.milliseconds)
897
898
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose SSL context honors the relevant
    entries of the *params* options dict."""
    from ..networking._helper import make_ssl_context  # delayed import to avoid circularity
    return YoutubeDLHTTPSHandler(params, context=make_ssl_context(
        verify=not params.get('nocheckcertificate'),
        client_certificate=params.get('client_certificate'),
        client_certificate_key=params.get('client_certificate_key'),
        client_certificate_password=params.get('client_certificate_password'),
        legacy_support=params.get('legacyserverconnect'),
        use_certifi='no-certifi' not in params.get('compat_opts', []),
    ), **kwargs)
909
910
def bug_reports_message(before=';'):
    """Return a standard "please report this issue" blurb.

    @param before  Text this message will follow; controls capitalization and spacing
    """
    from ..update import REPOSITORY  # delayed import to avoid circularity

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    # Capitalize when starting a new sentence
    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg
922
923
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Subclasses may set a class-level default message
    msg = None

    def __init__(self, msg=None):
        if msg is None:
            # Fall back to the class default, then to the class name
            msg = self.msg if self.msg is not None else type(self).__name__
        self.msg = msg
        super().__init__(msg)
934
935
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions  # delayed import to avoid circularity
        # Network failures are always "expected" (not yt-dlp bugs)
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message assembled from extractor name, video id, original
        # message, cause, and (for unexpected errors) the bug-report blurb
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted traceback(s) of this error and its cause, or None."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        # Keep msg/args in sync when any contributing attribute changes
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
978
979
class UnsupportedError(ExtractorError):
    """Raised when no extractor handles the given URL."""

    def __init__(self, url):
        super().__init__(f'Unsupported URL: {url}', expected=True)
        self.url = url
985
986
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
990
991
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Geo restrictions are never yt-dlp bugs
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        # List of country codes the video is restricted to, if known
        self.countries = countries
1003
1004
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        # Not being live is an expected condition, not a bug
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)
1011
1012
class DownloadError(YoutubeDLError):
    """Raised by FileDownloader objects that are not configured to continue
    on errors; carries the human-readable message and, optionally, the
    original exception info.
    """

    def __init__(self, msg, exc_info=None):
        """@param exc_info  The original cause, as returned by sys.exc_info(), if available"""
        super().__init__(msg)
        self.exc_info = exc_info
1025
1026
class EntryNotInPlaylist(YoutubeDLError):
    """Raised by YoutubeDL when a requested entry cannot be found in the
    playlist info_dict.
    """
    msg = 'Entry not found in info'
1034
1035
class SameFileError(YoutubeDLError):
    """Same File exception.

    Raised by FileDownloader objects when multiple files would have to be
    downloaded to the same path on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """@param filename  The conflicting output path, appended to the message if given"""
        if filename is not None:
            # BUG FIX: the literal text "(unknown)" was appended instead of
            # the actual conflicting filename, leaving `filename` unused
            self.msg += f': {filename}'
        super().__init__(self.msg)
1048
1049
class PostProcessingError(YoutubeDLError):
    """Raised by a PostProcessor's run() method to signal that the
    postprocessing task failed.
    """
1056
1057
class DownloadCancelled(YoutubeDLError):
    """Base class for errors that should interrupt the download queue"""
    msg = 'The download was cancelled'
1061
1062
class ExistingVideoReached(DownloadCancelled):
    """Raised when --break-on-existing hits a video already in the archive"""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1066
1067
class RejectedVideoReached(DownloadCancelled):
    """Raised when --break-match-filter hits a video that fails the filter"""
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1071
1072
class MaxDownloadsReached(DownloadCancelled):
    """Raised when the --max-downloads limit has been reached"""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1076
1077
class ReExtractInfo(YoutubeDLError):
    """Raised when the video info needs to be extracted again"""

    def __init__(self, msg, expected=False):
        """@param expected  Whether the failure is anticipated (suppresses bug-report text downstream)"""
        super().__init__(msg)
        self.expected = expected
1084
1085
class ThrottledDownload(ReExtractInfo):
    """Raised when the measured speed falls below --throttled-rate"""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)
1092
1093
class UnavailableVideoError(YoutubeDLError):
    """Raised when a video is requested in a format that is not available
    for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """@param err  Optional detail appended to the base message"""
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1106
1107
class ContentTooShortError(YoutubeDLError):
    """Raised when a downloaded file is smaller than the size announced by
    the server, which usually indicates an interrupted connection.
    """

    def __init__(self, downloaded, expected):
        """Both sizes are in bytes."""
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        self.downloaded = downloaded
        self.expected = expected
1121
1122
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing xattr metadata fails; classifies the failure into `reason`."""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code, self.msg = code, msg

        # Classify the failure so callers can react without string-parsing.
        # Message substrings are checked as a fallback when no errno is given.
        no_space = self.code in (errno.ENOSPC, errno.EDQUOT)
        if no_space or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg:
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1137
1138
class XAttrUnavailableError(YoutubeDLError):
    """Raised when no working xattr implementation is available"""
1141
1142
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler supporting a per-request SOCKS proxy and a custom connection class."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params  # options dict; stored but not read in this class

    def https_open(self, req):
        """Open an HTTPS request, honoring an optional 'Ytdl-socks-proxy' pseudo-header."""
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # A SOCKS proxy is smuggled in via a pseudo-header; wrap the
        # connection class accordingly and strip the header before sending
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            from ..networking._urllib import make_socks_conn_class
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        from ..networking._urllib import _create_http_connection
        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                # Map this specific TLS handshake failure to an actionable hint
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
1173
1174
def is_path_like(f):
    """Return True if `f` can be used as a filesystem path (str, bytes or os.PathLike)."""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1177
1178
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """Cookie processor that applies the same cookie handling to HTTP and HTTPS traffic."""

    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the HTTP hooks for HTTPS requests/responses
    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1188
1189
def extract_timezone(date_str):
    """Split a date string into (utc-offset timedelta, date_str with the timezone removed)."""
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
            [ ]?                                                 # optional space
            (?P<sign>\+|-)                                       # +/-
            (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})           # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # No numeric offset found: look for a trailing timezone abbreviation
        # after a time-of-day (e.g. '... 12:30 EST')
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            # Known abbreviation: strip it and convert its hour offset
            date_str = date_str[:-len(m.group('tz'))]
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # A plain 'Z' means UTC (zero offset)
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1218
1219
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Drop fractional seconds; strptime's %S does not accept them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
        return calendar.timegm(dt.timetuple())
1235
1236
def date_formats(day_first=True):
    """Return the known date format strings, preferring day-first or month-first order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1239
1240
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    # Commas, AM/PM markers and timezones carry no date information
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    upload_date = None
    # Try every known format; later formats deliberately overwrite earlier hits
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822-style parsing
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)
1263
1264
def unified_timestamp(date_str, day_first=True):
    """Parse a date/time string in many formats into a UNIX timestamp, or None."""
    if not isinstance(date_str, str):
        return None

    # Drop commas/pipes and day-of-week names, then collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # 12-hour clock: the offset is applied after parsing below
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to RFC 2822-style parsing
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1296
1297
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to `default_ext`."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1309
1310
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle filename by inserting '<lang>.<format>' as the extension."""
    sub_ext = f'{sub_lang}.{sub_format}'
    return replace_extension(filename, sub_ext, expected_real_ext)
1313
1314
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        # Defer rounding: keep full precision here, round by the parsed unit below
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # Recursively resolve the base date, then apply the signed offset
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # Months/years have variable length, so use calendar arithmetic
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # 'auto': round to the unit the user actually wrote
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1355
1356
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict Restrict allowed patterns to "YYYYMMDD" and
                  (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1367
1368
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by a number of months (may be negative)."""
    total_months = dt.month - 1 + months
    year = dt.year + total_months // 12
    month = total_months % 12 + 1
    # Clamp to the last valid day of the target month (e.g. Jan 31 + 1 month -> Feb 28/29)
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
1376
1377
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {'day': 86400, 'hour': 3600, 'minute': 60, 'second': 1}
    step = unit_seconds[precision]
    timestamp = calendar.timegm(dt.timetuple())
    # Round half-up to the nearest multiple of `step`
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1394
1395
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
1404
1405
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest possible range
        self.start = date_from_str(start, strict=True) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end, strict=True) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1439
1440
@functools.cache
def system_identifier():
    """Return a one-line description of the Python/OS environment (used in bug reports)."""
    python_implementation = platform.python_implementation()
    if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        python_implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1459
1460
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1468
1469
def write_string(s, out=None, encoding=None):
    """Write `s` to `out` (default: stderr), handling binary streams and encoding fallbacks."""
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # NOTE(review): prefixing line breaks with a space appears to be a
        # Windows terminal rendering workaround -- confirm before relying on it
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: we must encode ourselves
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: write bytes to it
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1489
1490
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Emit a deprecation message: printed once per message in CLI mode,
    raised as a DeprecationWarning when used as a library."""
    from .. import _IN_CLI
    if _IN_CLI:
        # Print each distinct message only once per process
        if msg in deprecation_warning._cache:
            return
        deprecation_warning._cache.add(msg)
        if printer:
            return printer(f'{msg}{bug_reports_message()}', **kwargs)
        return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
    else:
        import warnings
        # +3 skips this frame, the caller's wrapper and warnings' own machinery
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)


# Messages already shown in this process (CLI mode only)
deprecation_warning._cache = set()
1506
1507
def bytes_to_intlist(bs):
    """Convert a bytes-like (or character) sequence into a list of integer values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: bytes/bytearray index to ints directly
        return list(bs)
    # Character sequences (str): map each character to its code point
    return [ord(ch) for ch in bs]
1515
1516
def intlist_to_bytes(xs):
    """Pack a sequence of integers (0-255) back into a bytes object."""
    if not xs:
        return b''
    return struct.pack(f'{len(xs)}B', *xs)
1521
1522
class LockingUnsupportedError(OSError):
    """Raised on platforms where no file locking mechanism is available"""
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1528
1529
# Cross-platform file locking: defines _lock_file(f, exclusive, block) and
# _unlock_file(f) using Win32 LockFileEx/UnlockFileEx on Windows and
# fcntl on POSIX, with a LockingUnsupportedError fallback.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED struct passed to (Un)LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering the whole file (low/high halves of the length)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the matching unlock
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 selects an exclusive lock, 0x1 makes the call fail
        # immediately instead of blocking
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            # Try every unlock mechanism in turn; first success wins
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
1616
1617
class locked_file:
    """File wrapper that holds an advisory lock for its open lifetime.

    Usable as a context manager; other attribute access is delegated to
    the underlying file object.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """
        @param mode      One of 'r', 'rb', 'a', 'ab', 'w', 'wb'
        @param block     Whether to wait for the lock (False raises instead)
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        # Readers take a shared lock; all other modes an exclusive one
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only after the lock is held (see O_CREAT note above)
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        """Release the lock (if held) without closing the file."""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the object can be used without a `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate read/write/seek/... to the wrapped file object
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1681
1682
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' if it is None."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1687
1688
def shell_quote(args):
    """Quote each argument for safe use on a shell command line and join with spaces."""
    encoding = get_filesystem_encoding()

    def _as_str(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(_as_str(arg)) for arg in args)
1698
1699
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge with any data already smuggled into the URL
    url, existing_data = unsmuggle_url(url, {})
    data.update(existing_data)
    payload = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{payload}'
1708
1709
def unsmuggle_url(smug_url, default=None):
    """Extract data hidden by smuggle_url(); returns (clean_url, data_or_default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, payload = smug_url.rpartition('#')
    smuggled_json = urllib.parse.parse_qs(payload)['__youtubedl_smuggle'][0]
    return url, json.loads(smuggled_json)
1717
1718
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal sufixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        exponent = min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        # Binary prefixes: k -> Ki, M -> Mi, ... ('' stays '')
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    return fmt % (num / factor ** exponent, suffix)
1731
1732
def format_bytes(bytes):
    """Format a byte count like '5.50MiB' using binary prefixes; 'N/A' when not parseable."""
    # `bytes` shadows the builtin, but the name is kept since callers may pass it by keyword
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
1735
1736
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number> <unit>' using `unit_table` multipliers; return rounded int or None."""
    # Non-strict mode also accepts ',' as a decimal separator
    num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(map(re.escape, unit_table))
    matcher = re.fullmatch if strict else re.match
    m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None

    num = float(m.group('num').replace(',', '.'))
    return round(num * unit_table[m.group('unit')])
1748
1749
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    table = {unit: 1024 ** exponent for exponent, unit in enumerate(['', *'KMGTPEZY'])}
    return lookup_unit_table(table, s.upper(), strict=True)
1755
1756
def parse_filesize(s):
    """Parse a human-readable file size like '5.6 MiB' or '1,5GB' into bytes, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too. The table is generated instead of written
    # out: for every magnitude there are seven spellings -- XiB (binary),
    # XB (decimal), xB (binary), Xb/xb (decimal), plus the two full names.
    _UNIT_TABLE = {'B': 1, 'b': 1, 'bytes': 1}
    _DECIMAL_NAMES = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa', 'zetta', 'yotta')
    _BINARY_NAMES = ('kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', 'zebi', 'yobi')
    for exponent, (dec_name, bin_name) in enumerate(zip(_DECIMAL_NAMES, _BINARY_NAMES), start=1):
        lower = dec_name[0]
        upper = lower.upper()
        decimal, binary = 1000 ** exponent, 1024 ** exponent
        _UNIT_TABLE.update({
            f'{upper}iB': binary,
            f'{upper}B': decimal,
            f'{lower}B': binary,
            f'{upper}b': decimal,
            f'{lower}b': decimal,
            f'{dec_name}bytes': decimal,
            f'{bin_name}bytes': binary,
        })

    return lookup_unit_table(_UNIT_TABLE, s)
1826
1827
def parse_count(s):
    """Parse a human-readable count like '1.2M views' into an int, or None."""
    if s is None:
        return None

    # Drop a leading non-numeric word (e.g. 'Views 1,234' -> '1,234')
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    ret = lookup_unit_table(_UNIT_TABLE, s)
    if ret is not None:
        return ret

    # Last resort: take the leading number and ignore any trailing text
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(mobj.group(1)) if mobj else None
1855
1856
def parse_resolution(s, *, lenient=False):
    """Extract width/height from strings like '1920x1080', '720p' or '4k'; {} when absent."""
    if s is None:
        return {}

    # `lenient` drops the requirement that the numbers are not embedded in words
    pattern = (r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
               else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    mobj = re.search(pattern, s)
    if mobj:
        return {'width': int(mobj.group('w')), 'height': int(mobj.group('h'))}

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(mobj.group(1)) * 540}

    return {}
1880
1881
def parse_bitrate(s):
    """Parse a bitrate like '128 kbps' into an int (in kbps), or None."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
1888
1889
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    # Unknown languages fall back to the English month names
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name in month_names:
        return month_names.index(name) + 1
    return None
1899
1900
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """

    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1909
1910
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Negative lookahead leaves existing entities and character references untouched
    unescaped_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(unescaped_amp, '&amp;', xml_str)
1917
1918
def setproctitle(title):
    """Set the process title (as shown by e.g. ps) via libc prctl; silently no-op on failure."""
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        # Not a glibc system (or library missing) -- nothing to do
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME on Linux
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1944
1945
def remove_start(s, start):
    """Strip `start` from the beginning of `s` if present; tolerates s=None."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1948
1949
def remove_end(s, end):
    """Strip `end` from the end of `s` if present; tolerates s=None.

    BUG FIX: an empty `end` previously truncated the whole string, because
    ``s.endswith('')`` is True and ``s[:-0]`` is the empty string. Removing
    an empty suffix now leaves `s` unchanged.
    """
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
1952
1953
def remove_quotes(s):
    """Strip one level of matching single or double quotes surrounding `s`."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1961
1962
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1969
1970
def url_basename(url):
    """Return the last path component of a URL, e.g. 'v.mp4' for 'http://x/a/v.mp4?q=1'."""
    path = urllib.parse.urlparse(url).path
    return path.rstrip('/').rpartition('/')[2]
1974
1975
def base_url(url):
    """Return the URL up to and including the last '/' before any query or fragment."""
    return re.match(r'https?://[^?#]+/', url).group(0)
1978
1979
def urljoin(base, path):
    """Join `base` and `path` like urllib.parse.urljoin; None for unusable inputs."""
    if isinstance(path, bytes):
        path = path.decode()
    if not path or not isinstance(path, str):
        return None
    # Absolute or protocol-relative paths are returned unchanged
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
1993
1994
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` (optionally via attribute `get_attr`) to an int scaled by
    invscale/scale; return `default` on any conversion failure."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        # The whole expression stays inside the try so that bad scale types
        # also fall back to the default, matching EAFP semantics
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
2002
2003
def str_or_none(v, default=None):
    """Return str(v), or `default` when v is None."""
    if v is None:
        return default
    return str(v)
2006
2007
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Drop thousands separators, dots and plus signs
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
2015
2016
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to a float scaled by invscale/scale; `default` on failure or None.

    Note: float(None) raises TypeError, so the None case is covered by the
    same exception handler as any other invalid input.
    """
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
2024
2025
def bool_or_none(v, default=None):
    """Return `v` only if it is a real bool; otherwise `default`."""
    if isinstance(v, bool):
        return v
    return default
2028
2029
def strip_or_none(v, default=None):
    """Return v.strip() for strings; `default` for anything else."""
    if isinstance(v, str):
        return v.strip()
    return default
2032
2033
def url_or_none(url):
    """Return the stripped URL if it uses a supported scheme (or is
    protocol-relative); otherwise None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
2039
2040
def request_to_url(req):
    """Extract the full URL from a urllib Request; pass other values through."""
    return req.get_full_url() if isinstance(req, urllib.request.Request) else req
2046
2047
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a unix timestamp (int/float) or YYYYMMDD string with strftime,
    returning *default* when the input cannot be interpreted."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            epoch = datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
            datetime_object = epoch + datetime.timedelta(seconds=timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            datetime_object = None
        # Support %s (unix timestamp) on Windows; the f-string below also
        # raises AttributeError for unhandled input (datetime_object is None)
        date_format = re.sub(
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2065
2066
def parse_duration(s):
    """Parse a human-readable duration string into seconds (float), or None.

    Accepts clock notation ([[days:]hours:]mins:secs[.ms]), ISO-8601-like
    periods (e.g. PT1H30M) and loose English forms ("3 min", "2.5 hours").
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # 1. Clock notation; secs is limited to 2 digits only when preceded by mins
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2. ISO-8601-like periods and spelled-out units; years/months/weeks
        # are matched but not captured, so they do not contribute to the sum
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3. Last resort: bare "N hours" / "N mins" with decimals allowed
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        ms = ms.replace(':', '.')  # clock notation may delimit ms with ':'
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2121
2122
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the filename's real extension.

    When *expected_real_ext* is given and the real extension differs, *ext* is
    appended after the whole filename instead.

    BUGFIX: the mismatch branch previously returned the literal string
    '(unknown).{ext}', discarding the original filename entirely.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else f'{filename}.{ext}')
2129
2130
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the filename's extension for *ext*; when *expected_real_ext* is
    given and does not match, append instead of replacing."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return f'{name}.{ext}'
2136
2137
def check_executable(exe, args=[]):
    """Check whether *exe* can be spawned from PATH; return its name or False.

    args can be a list of arguments for a short-output invocation (like -version).
    """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
2146
2147
def _get_exe_version_output(exe, args):
    """Run *exe* with *args* and return its combined stdout/stderr text;
    None on a non-zero exit status, False when the binary cannot be launched."""
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else stdout
2160
2161
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from tool *output* using *version_re*;
    fall back to *unrecognized* when nothing matches."""
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    mobj = re.search(version_re, output)
    return mobj.group(1) if mobj else unrecognized
2171
2172
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    out = _get_exe_version_output(exe, args)
    if out is None:
        # Ran but exited non-zero
        return unrecognized[-1]
    if not out:
        # False (not launchable) or empty output
        return out
    return detect_exe_version(out, version_re, unrecognized[0])
2183
2184
def frange(start=0, stop=None, step=1):
    """Float version of range(); supports fractional start/stop/step."""
    if stop is None:
        start, stop = 0, start
    if not step:
        direction = 0  # zero step never advances, so never yield
    elif step > 0:
        direction = 1
    else:
        direction = -1
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2193
2194
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache is shared with views produced by __reversed__/__copy__
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        # Pull everything still pending into the cache
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map an index to the equivalent index from the opposite end (~x)
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Consume only as many items as needed to cover the requested index
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2282
2283
class PagedList:
    """Base class for paginated entry lists; subclasses implement _getslice()."""

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        # pagefunc: callable(pagenum) -> iterable of entries for that page
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        # Updated by subclasses once the real page count becomes known
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return (and optionally cache) the list of entries on *pagenum*."""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            # Pages beyond the known page count are empty by definition
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Materialize the entries in [start, end) as a list."""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # NOTE: only non-negative integer indexing is supported
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
2322
2323
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested window within the current page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember where fetching failed so getpage() treats later
                # pages as empty
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2363
2364
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # Entries to drop from the first page; cleared after the first trim
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted (None = unbounded)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
2389
2390
class PlaylistEntries:
    """Accessor over a playlist's entries: resolves --playlist-items style
    specifications against list, PagedList or lazily-evaluated entries."""

    # Sentinel marking an entry absent from an incomplete playlist
    MissingEntry = object()
    # True once the full extent of the entry list is known
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Spread the known entries over their 1-based requested positions
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches one "[start][:-end[:step]]" segment of a playlist-items spec
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int (single item) or slice per comma-separated segment."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (1-based index, entry) pairs for the user-requested items."""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total entry count when it is knowable, else None."""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Select an index->entry accessor appropriate for the entry container
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Generator of (1-based index, entry); accepts an int or a slice
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2525
2526
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences in *s* into the actual characters."""
    decoder = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda mobj: decoder(mobj.group(0))[0],
        s)
2533
2534
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences in *s* into the actual characters."""
    decoder = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda mobj: decoder(mobj.group(0))[0],
        s)
2541
2542
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    return urllib.parse.quote(s, safe=b"%/;:@&=+$,!~*'()?#[]")
2546
2547
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    def _esc(component):  # same escaping as escape_rfc3986()
        return urllib.parse.quote(component, b"%/;:@&=+$,!~*'()?#[]")

    parsed = urllib.parse.urlparse(url)
    return parsed._replace(
        netloc=parsed.netloc.encode('idna').decode('ascii'),
        path=_esc(parsed.path),
        params=_esc(parsed.params),
        query=_esc(parsed.query),
        fragment=_esc(parsed.fragment),
    ).geturl()
2558
2559
def parse_qs(url, **kwargs):
    """Parse the query string of *url* into a dict of value lists."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2562
2563
def read_batch_urls(batch_fd):
    """Read URLs from a batch-file object, skipping BOMs, comments and blanks.

    @param batch_fd  iterable of lines (text or bytes); closed when done
    @returns         list of cleaned-up URLs

    FIX: pass maxsplit to re.split by keyword — the positional form is
    deprecated since Python 3.13.
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2581
2582
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2585
2586
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url str or parse url tuple
    @param query_update update query
    @returns str
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            return url  # nothing to change; skip the parse/unparse round-trip
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2605
2606
def update_url_query(url, query):
    """Merge the dict *query* into the URL's existing query string."""
    return update_url(url, query_update=query)
2609
2610
2611 def _multipart_encode_impl(data, boundary):
2612 content_type = 'multipart/form-data; boundary=%s' % boundary
2613
2614 out = b''
2615 for k, v in data.items():
2616 out += b'--' + boundary.encode('ascii') + b'\r\n'
2617 if isinstance(k, str):
2618 k = k.encode()
2619 if isinstance(v, str):
2620 v = v.encode()
2621 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2622 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2623 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2624 if boundary.encode('ascii') in content:
2625 raise ValueError('Boundary overlaps with data')
2626 out += content
2627
2628 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2629
2630 return out, content_type
2631
2632
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            # Retry with a new random boundary on collision with the payload
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            if has_specified_boundary:
                raise
            boundary = None
2661
2662
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """True when *x* is an iterable container (by default excluding str,
    bytes and mappings)."""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
2667
2668
def variadic(x, allowed_types=NO_DEFAULT):
    """Return *x* unchanged when it is already list-like; otherwise wrap it
    in a one-element tuple."""
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
2674
2675
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn; return the first result that raises none of
    the common lookup/arithmetic errors and (when given) matches *expected_type*."""
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
2685
2686
def try_get(src, getter, expected_type=None):
    # Apply one or more getter callables to src; return the first result that
    # does not raise (and matches expected_type, when given)
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2689
2690
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Keep only the (key, value) pairs for which cndn(key, value) is truthy."""
    return {key: value for key, value in dct.items() if cndn(key, value)}
2693
2694
def merge_dicts(*dicts):
    """Merge dicts left-to-right: earlier dicts win, except that an
    empty-string value may be replaced by a later string value; None values
    are never merged."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            if value is None:
                continue
            if key not in merged or (isinstance(value, str) and merged[key] == ''):
                merged[key] = value
    return merged
2703
2704
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Coerce bytes to str using *encoding*; pass str through unchanged.
    # NOTE: the default encoding is evaluated once, at import time
    return string if isinstance(string, str) else str(string, encoding, errors)
2707
2708
# MPAA movie rating -> equivalent age limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines rating -> equivalent age limit
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2726
2727
def parse_age_limit(s):
    """Normalize an age-limit value (int, 'NN+', US movie or TV rating)
    to an integer age, or None when unparsable."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    mobj = re.match(r'^TV[_-]?({})$'.format('|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)), s)
    if mobj:
        return TV_PARENTAL_GUIDELINES['TV-' + mobj.group(1)]
    return None
2744
2745
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback(...);) and return the inner payload;
    non-JSONP input is returned unchanged."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
2754
2755
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript value/object literal into JSON text.

    @param vars    dict of variable-name -> value substitutions
    @param strict  raise ValueError on unknown identifiers instead of
                   stringifying them
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Escapes JSON understands pass through; \x becomes \u00; a
        # backslash-newline disappears; anything else is unescaped
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Evaluate a ${...} interpolation inside a template string
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        # Hex/octal integers; a trailing ':' marks an object key
        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # new Map([[k, v], ...]) -> plain JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # Best-effort cleanup of common constructor/IIFE patterns
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2834
2835
def qualities(quality_ids):
    """Return a function mapping a quality id to its rank within
    *quality_ids* (-1 for unknown ids)."""
    def lookup(quality_id):
        try:
            return quality_ids.index(quality_id)
        except ValueError:
            return -1
    return lookup
2844
2845
# Recognized postprocessor hook names
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Output filename templates used when the user supplies none
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output-template types; the string values are presumably the default
# filename infix for that type (e.g. "<name>.info.json") — verify against callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# NOTE: this is a str.format template: {0} is the key pattern, {1} the type pattern
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
    '''


STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2885
2886
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
2895
2896
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2899
2900
def is_outdated_version(version, limit, assume_new=True):
    """True when *version* sorts strictly before *limit*; empty or unparsable
    input yields ``not assume_new``."""
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
2908
2909
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at module top — presumably to avoid an
    # import cycle with the update module; confirm before moving
    from ..update import is_non_updateable

    return not is_non_updateable()
2916
2917
def args_to_str(args):
    """Get a short, shell-quoted string representation of a subprocess command."""
    return ' '.join(map(compat_shlex_quote, args))
2921
2922
def error_to_str(err):
    """Format an exception as 'TypeName: message'."""
    return '{}: {}'.format(type(err).__name__, err)
2925
2926
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (optionally with parameters) to a file extension.

    Looks up the full type, then the bare subtype, then the subtype's last
    '+'-suffix; unknown subtypes fall back to *default* or to the subtype
    with '+' replaced by '.'.
    """
    if not isinstance(mt, str):
        if default is not NO_DEFAULT:
            return default
        return None

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Discard parameters (e.g. "; charset=utf-8") and normalize case
    mimetype = mt.partition(';')[0].strip().lower()
    _, _, subtype = mimetype.rpartition('/')

    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    elif default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
3015
3016
def ext2mimetype(ext_or_url):
    """Guess a MIME type from a bare extension or a URL/filename;
    None when unknown or empty."""
    if not ext_or_url:
        return None
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    return mimetypes.guess_type(name)[0]
3023
3024
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into vcodec/acodec/scodec fields and,
    where detectable from the codec id, the dynamic range (DV/HDR10)."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Collapse zero-padded numbers before splitting on '.'
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                continue
            vcodec = full_codec
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Exactly two unrecognized codecs: assume video + audio, in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3065
3066
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Pick a container extension able to hold the given video/audio streams.

    Tries codec compatibility first, then extension compatibility, finally
    falling back to mkv (or the last preference when mkv is not allowed).
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    # Multiple video or audio streams: only mkv is known to hold them
    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Reduce e.g. 'avc1.640028' to 'avc1' (dropping zeroes and profile info)
    sanitize_codec = functools.partial(
        try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3106
3107
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Determine a file extension from a response's headers."""
    headers = url_handle.headers

    # Prefer the filename advertised in Content-Disposition, if any
    cd = headers.get('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    # Some servers expose the original object name in this header
    meta_name = headers.get('x-amz-meta-name')
    if meta_name:
        ext = meta_name.rpartition('.')[2]
        if ext:
            return ext

    # Fall back to mapping the Content-Type MIME type
    return mimetype2ext(headers.get('Content-Type'), default=default)
3126
3127
def encode_data_uri(data, mime_type):
    """Build a base64 `data:` URI for the given bytes and MIME type."""
    encoded = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{encoded}'
3130
3131
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # Either no limit was requested, or the content is unrestricted
        return False
    return age_limit < content_limit
3140
3141
# List of known byte-order-marks (BOM)
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    # The UTF-32 BOMs must come before the UTF-16 ones, since
    # the UTF-16-LE BOM is a prefix of the UTF-32-LE BOM
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3150
3151
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip (possibly repeated) BOMs and remember the encoding they imply
        while first_bytes.startswith(bom):
            encoding = bom_encoding
            first_bytes = first_bytes[len(bom):]
    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3161
3162
def determine_protocol(info_dict):
    """Infer the download protocol for an info dict (explicit value wins)."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    # Streaming protocols identified directly by URL scheme prefix
    for scheme in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(scheme):
            return scheme

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    # Plain http/https (or whatever the URL scheme is)
    return urllib.parse.urlparse(url).scheme
3183
3184
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: terminal escape sequences and tabs occupy no columns
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        # Widest cell of each column
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only the columns whose filterArray entry is truthy;
        # missing entries default to True, so extra columns are kept
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # With hide_empty, drop columns whose cells are all empty (max width 0)
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a delimiter row (delim repeated to each column's width) after the header
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Replace the \t with padding so the trailing part is right-aligned
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3215
3216
3217 def _match_one(filter_part, dct, incomplete):
3218 # TODO: Generalize code with YoutubeDL._build_format_filter
3219 STRING_OPERATORS = {
3220 '*=': operator.contains,
3221 '^=': lambda attr, value: attr.startswith(value),
3222 '$=': lambda attr, value: attr.endswith(value),
3223 '~=': lambda attr, value: re.search(value, attr),
3224 }
3225 COMPARISON_OPERATORS = {
3226 **STRING_OPERATORS,
3227 '<=': operator.le, # "<=" must be defined above "<"
3228 '<': operator.lt,
3229 '>=': operator.ge,
3230 '>': operator.gt,
3231 '=': operator.eq,
3232 }
3233
3234 if isinstance(incomplete, bool):
3235 is_incomplete = lambda _: incomplete
3236 else:
3237 is_incomplete = lambda k: k in incomplete
3238
3239 operator_rex = re.compile(r'''(?x)
3240 (?P<key>[a-z_]+)
3241 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3242 (?:
3243 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3244 (?P<strval>.+?)
3245 )
3246 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3247 m = operator_rex.fullmatch(filter_part.strip())
3248 if m:
3249 m = m.groupdict()
3250 unnegated_op = COMPARISON_OPERATORS[m['op']]
3251 if m['negation']:
3252 op = lambda attr, value: not unnegated_op(attr, value)
3253 else:
3254 op = unnegated_op
3255 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3256 if m['quote']:
3257 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3258 actual_value = dct.get(m['key'])
3259 numeric_comparison = None
3260 if isinstance(actual_value, (int, float)):
3261 # If the original field is a string and matching comparisonvalue is
3262 # a number we should respect the origin of the original field
3263 # and process comparison value as a string (see
3264 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3265 try:
3266 numeric_comparison = int(comparison_value)
3267 except ValueError:
3268 numeric_comparison = parse_filesize(comparison_value)
3269 if numeric_comparison is None:
3270 numeric_comparison = parse_filesize(f'{comparison_value}B')
3271 if numeric_comparison is None:
3272 numeric_comparison = parse_duration(comparison_value)
3273 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3274 raise ValueError('Operator %s only supports string values!' % m['op'])
3275 if actual_value is None:
3276 return is_incomplete(m['key']) or m['none_inclusive']
3277 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3278
3279 UNARY_OPERATORS = {
3280 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3281 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3282 }
3283 operator_rex = re.compile(r'''(?x)
3284 (?P<op>%s)\s*(?P<key>[a-z_]+)
3285 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3286 m = operator_rex.fullmatch(filter_part.strip())
3287 if m:
3288 op = UNARY_OPERATORS[m.group('op')]
3289 actual_value = dct.get(m.group('key'))
3290 if is_incomplete(m.group('key')) and actual_value is None:
3291 return True
3292 return op(actual_value)
3293
3294 raise ValueError('Invalid filter part %r' % filter_part)
3295
3296
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
    Can be True/False to indicate all/none of the keys may be missing.
    All conditions on incomplete keys pass if the key is missing
    """
    # '&' joins conditions; a literal '&' may be escaped as '\&'
    conditions = re.split(r'(?<!\\)&', filter_str)
    return all(
        _match_one(condition.replace(r'\&', '&'), dct, incomplete)
        for condition in conditions)
3307
3308
def match_filter_func(filters, breaking_filters=None):
    """Build a match-filter callable from filter strings; None if no filters given."""
    if not filters and not breaking_filters:
        return None
    # Breaking filters abort the whole run (RejectedVideoReached) when they match
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    # A lone '-' requests interactive confirmation instead of rejection
    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        breaking_reason = breaking_filters(info_dict, incomplete)
        if breaking_reason is not None:
            raise RejectedVideoReached(breaking_reason)

        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            # NO_DEFAULT signals "ask the user" in interactive mode
            return NO_DEFAULT if interactive and not incomplete else None
        video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
        filter_str = ') | ('.join(map(str.strip, filters))
        return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3331
3332
class download_range_func:
    """Callable that yields the sections (matched chapters and/or time ranges)
    of a video to download."""

    def __init__(self, chapters, ranges, from_info=False):
        # chapters: regexes matched against chapter titles
        # ranges: (start_time, end_time) pairs in seconds
        # from_info: also honor start_time/end_time from the info dict
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):
        # Generator: yields one dict per section to download

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
        # Warn only when chapter regexes were given but nothing matched
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # No constraints at all: download everything
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # Negative timestamps count back from the end of the video (when duration is known)
        return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time

    def __eq__(self, other):
        # NOTE(review): from_info is not part of equality — confirm this is intentional
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges)

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3373
3374
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds; None if unparsable."""
    if not time_expr:
        return None

    # Plain offset in seconds, optionally suffixed with 's' (e.g. '12.3s')
    mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # Clock time 'H:MM:SS', fraction separated by '.' or ':' (frames variant)
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes, seconds = mobj.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3386
3387
def srt_subtitles_timecode(seconds):
    """Format seconds as an SRT timecode: HH:MM:SS,mmm."""
    hours, minutes, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, msec)
3390
3391
def ass_subtitles_timecode(seconds):
    """Format seconds as an ASS timecode: H:MM:SS.cc (centiseconds)."""
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], timetuple.milliseconds / 10)
3395
3396
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no subtitle paragraphs
    '''
    # Legacy TTAF namespaces are rewritten to their modern TTML equivalents
    # (as raw bytes, before XML parsing)
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # Styling attributes that are converted to SRT/HTML-like markup
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    # Helper to qualify tag/attribute names with the TTML namespaces
    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}          # style id -> resolved style properties
    default_style = {}   # style inherited from the body/div elements

    class TTMLPElementParser:
        # SAX-style target that renders one <p> subtree to SRT-styled text.
        # NOTE(review): these are class-level (shared) attributes; appends in
        # start() are balanced by pops in end(), but state could leak between
        # instances if parsing aborts mid-element — confirm this is acceptable
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                # Inline tts:* attributes override referenced styles
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the markup opened for this element, innermost first
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize the node and feed it through TTMLPElementParser to get styled text
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; loop again while a parent style is not yet known
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style referenced from body/div becomes the default for all paragraphs
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # No explicit end: derive it from the duration, else skip the cue
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3563
3564
def cli_option(params, command_option, param, separator=None):
    """Render one CLI option from a params dict entry; [] when the value is unset."""
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3570
3571
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean CLI option; [] when the param is unset (None)."""
    value = params.get(param)
    assert value in (True, False, None)
    if value is None:
        return []
    rendered = true_value if value else false_value
    if separator is None:
        return [command_option, str(rendered)]
    return [f'{command_option}{separator}{rendered}']
3576
3577
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3580
3581
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Return the first matching argument list from argdict for the given keys."""
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_list in keys:
        # A key entry may itself be several alternative keys
        matches = [argdict.get(key.lower()) for key in variadic(key_list)]
        arg_list = [args for args in matches if args is not None]
        if arg_list:
            # Flatten the per-key argument lists into one
            return [arg for args in arg_list for arg in args]
    return default
3600
3601
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve the configuration-arg lookup keys for main_key/exe and delegate."""
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    prefixed_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key in prefixed_keys:
        # The bare root key is requested -> also fall back to generic entries
        if main_key != exe:
            prefixed_keys.append((main_key, exe))
        prefixed_keys.append('default')
    else:
        use_compat = False
    return cli_configuration_args(argdict, prefixed_keys, default, use_compat)
3613
3614
class ISO639Utils:
    """Conversion between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter)
    language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE: some values are duplicated (e.g. 'he'/'iw' -> 'heb'); long2short
    # relies on insertion order to return the current (non-deprecated) code
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant (e.g. 'en-US' -> 'eng')
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear reverse lookup; returns None when no short form exists
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
3819
3820
class ISO3166Utils:
    """Lookup of full country names from two-letter country codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Lookup is case-insensitive; returns None for unknown codes.
        # NOTE(review): the keys are two-letter country codes (plus AP/EU for
        # IP blocks); the docstring's "3166-2" label looks like it means the
        # two-letter (alpha-2) form — confirm
        return cls._country_map.get(code.upper())
4082
4083
4084 class GeoUtils:
4085 # Major IPv4 address blocks per country
4086 _country_ip_map = {
4087 'AD': '46.172.224.0/19',
4088 'AE': '94.200.0.0/13',
4089 'AF': '149.54.0.0/17',
4090 'AG': '209.59.64.0/18',
4091 'AI': '204.14.248.0/21',
4092 'AL': '46.99.0.0/16',
4093 'AM': '46.70.0.0/15',
4094 'AO': '105.168.0.0/13',
4095 'AP': '182.50.184.0/21',
4096 'AQ': '23.154.160.0/24',
4097 'AR': '181.0.0.0/12',
4098 'AS': '202.70.112.0/20',
4099 'AT': '77.116.0.0/14',
4100 'AU': '1.128.0.0/11',
4101 'AW': '181.41.0.0/18',
4102 'AX': '185.217.4.0/22',
4103 'AZ': '5.197.0.0/16',
4104 'BA': '31.176.128.0/17',
4105 'BB': '65.48.128.0/17',
4106 'BD': '114.130.0.0/16',
4107 'BE': '57.0.0.0/8',
4108 'BF': '102.178.0.0/15',
4109 'BG': '95.42.0.0/15',
4110 'BH': '37.131.0.0/17',
4111 'BI': '154.117.192.0/18',
4112 'BJ': '137.255.0.0/16',
4113 'BL': '185.212.72.0/23',
4114 'BM': '196.12.64.0/18',
4115 'BN': '156.31.0.0/16',
4116 'BO': '161.56.0.0/16',
4117 'BQ': '161.0.80.0/20',
4118 'BR': '191.128.0.0/12',
4119 'BS': '24.51.64.0/18',
4120 'BT': '119.2.96.0/19',
4121 'BW': '168.167.0.0/16',
4122 'BY': '178.120.0.0/13',
4123 'BZ': '179.42.192.0/18',
4124 'CA': '99.224.0.0/11',
4125 'CD': '41.243.0.0/16',
4126 'CF': '197.242.176.0/21',
4127 'CG': '160.113.0.0/16',
4128 'CH': '85.0.0.0/13',
4129 'CI': '102.136.0.0/14',
4130 'CK': '202.65.32.0/19',
4131 'CL': '152.172.0.0/14',
4132 'CM': '102.244.0.0/14',
4133 'CN': '36.128.0.0/10',
4134 'CO': '181.240.0.0/12',
4135 'CR': '201.192.0.0/12',
4136 'CU': '152.206.0.0/15',
4137 'CV': '165.90.96.0/19',
4138 'CW': '190.88.128.0/17',
4139 'CY': '31.153.0.0/16',
4140 'CZ': '88.100.0.0/14',
4141 'DE': '53.0.0.0/8',
4142 'DJ': '197.241.0.0/17',
4143 'DK': '87.48.0.0/12',
4144 'DM': '192.243.48.0/20',
4145 'DO': '152.166.0.0/15',
4146 'DZ': '41.96.0.0/12',
4147 'EC': '186.68.0.0/15',
4148 'EE': '90.190.0.0/15',
4149 'EG': '156.160.0.0/11',
4150 'ER': '196.200.96.0/20',
4151 'ES': '88.0.0.0/11',
4152 'ET': '196.188.0.0/14',
4153 'EU': '2.16.0.0/13',
4154 'FI': '91.152.0.0/13',
4155 'FJ': '144.120.0.0/16',
4156 'FK': '80.73.208.0/21',
4157 'FM': '119.252.112.0/20',
4158 'FO': '88.85.32.0/19',
4159 'FR': '90.0.0.0/9',
4160 'GA': '41.158.0.0/15',
4161 'GB': '25.0.0.0/8',
4162 'GD': '74.122.88.0/21',
4163 'GE': '31.146.0.0/16',
4164 'GF': '161.22.64.0/18',
4165 'GG': '62.68.160.0/19',
4166 'GH': '154.160.0.0/12',
4167 'GI': '95.164.0.0/16',
4168 'GL': '88.83.0.0/19',
4169 'GM': '160.182.0.0/15',
4170 'GN': '197.149.192.0/18',
4171 'GP': '104.250.0.0/19',
4172 'GQ': '105.235.224.0/20',
4173 'GR': '94.64.0.0/13',
4174 'GT': '168.234.0.0/16',
4175 'GU': '168.123.0.0/16',
4176 'GW': '197.214.80.0/20',
4177 'GY': '181.41.64.0/18',
4178 'HK': '113.252.0.0/14',
4179 'HN': '181.210.0.0/16',
4180 'HR': '93.136.0.0/13',
4181 'HT': '148.102.128.0/17',
4182 'HU': '84.0.0.0/14',
4183 'ID': '39.192.0.0/10',
4184 'IE': '87.32.0.0/12',
4185 'IL': '79.176.0.0/13',
4186 'IM': '5.62.80.0/20',
4187 'IN': '117.192.0.0/10',
4188 'IO': '203.83.48.0/21',
4189 'IQ': '37.236.0.0/14',
4190 'IR': '2.176.0.0/12',
4191 'IS': '82.221.0.0/16',
4192 'IT': '79.0.0.0/10',
4193 'JE': '87.244.64.0/18',
4194 'JM': '72.27.0.0/17',
4195 'JO': '176.29.0.0/16',
4196 'JP': '133.0.0.0/8',
4197 'KE': '105.48.0.0/12',
4198 'KG': '158.181.128.0/17',
4199 'KH': '36.37.128.0/17',
4200 'KI': '103.25.140.0/22',
4201 'KM': '197.255.224.0/20',
4202 'KN': '198.167.192.0/19',
4203 'KP': '175.45.176.0/22',
4204 'KR': '175.192.0.0/10',
4205 'KW': '37.36.0.0/14',
4206 'KY': '64.96.0.0/15',
4207 'KZ': '2.72.0.0/13',
4208 'LA': '115.84.64.0/18',
4209 'LB': '178.135.0.0/16',
4210 'LC': '24.92.144.0/20',
4211 'LI': '82.117.0.0/19',
4212 'LK': '112.134.0.0/15',
4213 'LR': '102.183.0.0/16',
4214 'LS': '129.232.0.0/17',
4215 'LT': '78.56.0.0/13',
4216 'LU': '188.42.0.0/16',
4217 'LV': '46.109.0.0/16',
4218 'LY': '41.252.0.0/14',
4219 'MA': '105.128.0.0/11',
4220 'MC': '88.209.64.0/18',
4221 'MD': '37.246.0.0/16',
4222 'ME': '178.175.0.0/17',
4223 'MF': '74.112.232.0/21',
4224 'MG': '154.126.0.0/17',
4225 'MH': '117.103.88.0/21',
4226 'MK': '77.28.0.0/15',
4227 'ML': '154.118.128.0/18',
4228 'MM': '37.111.0.0/17',
4229 'MN': '49.0.128.0/17',
4230 'MO': '60.246.0.0/16',
4231 'MP': '202.88.64.0/20',
4232 'MQ': '109.203.224.0/19',
4233 'MR': '41.188.64.0/18',
4234 'MS': '208.90.112.0/22',
4235 'MT': '46.11.0.0/16',
4236 'MU': '105.16.0.0/12',
4237 'MV': '27.114.128.0/18',
4238 'MW': '102.70.0.0/15',
4239 'MX': '187.192.0.0/11',
4240 'MY': '175.136.0.0/13',
4241 'MZ': '197.218.0.0/15',
4242 'NA': '41.182.0.0/16',
4243 'NC': '101.101.0.0/18',
4244 'NE': '197.214.0.0/18',
4245 'NF': '203.17.240.0/22',
4246 'NG': '105.112.0.0/12',
4247 'NI': '186.76.0.0/15',
4248 'NL': '145.96.0.0/11',
4249 'NO': '84.208.0.0/13',
4250 'NP': '36.252.0.0/15',
4251 'NR': '203.98.224.0/19',
4252 'NU': '49.156.48.0/22',
4253 'NZ': '49.224.0.0/14',
4254 'OM': '5.36.0.0/15',
4255 'PA': '186.72.0.0/15',
4256 'PE': '186.160.0.0/14',
4257 'PF': '123.50.64.0/18',
4258 'PG': '124.240.192.0/19',
4259 'PH': '49.144.0.0/13',
4260 'PK': '39.32.0.0/11',
4261 'PL': '83.0.0.0/11',
4262 'PM': '70.36.0.0/20',
4263 'PR': '66.50.0.0/16',
4264 'PS': '188.161.0.0/16',
4265 'PT': '85.240.0.0/13',
4266 'PW': '202.124.224.0/20',
4267 'PY': '181.120.0.0/14',
4268 'QA': '37.210.0.0/15',
4269 'RE': '102.35.0.0/16',
4270 'RO': '79.112.0.0/13',
4271 'RS': '93.86.0.0/15',
4272 'RU': '5.136.0.0/13',
4273 'RW': '41.186.0.0/16',
4274 'SA': '188.48.0.0/13',
4275 'SB': '202.1.160.0/19',
4276 'SC': '154.192.0.0/11',
4277 'SD': '102.120.0.0/13',
4278 'SE': '78.64.0.0/12',
4279 'SG': '8.128.0.0/10',
4280 'SI': '188.196.0.0/14',
4281 'SK': '78.98.0.0/15',
4282 'SL': '102.143.0.0/17',
4283 'SM': '89.186.32.0/19',
4284 'SN': '41.82.0.0/15',
4285 'SO': '154.115.192.0/18',
4286 'SR': '186.179.128.0/17',
4287 'SS': '105.235.208.0/21',
4288 'ST': '197.159.160.0/19',
4289 'SV': '168.243.0.0/16',
4290 'SX': '190.102.0.0/20',
4291 'SY': '5.0.0.0/16',
4292 'SZ': '41.84.224.0/19',
4293 'TC': '65.255.48.0/20',
4294 'TD': '154.68.128.0/19',
4295 'TG': '196.168.0.0/14',
4296 'TH': '171.96.0.0/13',
4297 'TJ': '85.9.128.0/18',
4298 'TK': '27.96.24.0/21',
4299 'TL': '180.189.160.0/20',
4300 'TM': '95.85.96.0/19',
4301 'TN': '197.0.0.0/11',
4302 'TO': '175.176.144.0/21',
4303 'TR': '78.160.0.0/11',
4304 'TT': '186.44.0.0/15',
4305 'TV': '202.2.96.0/19',
4306 'TW': '120.96.0.0/11',
4307 'TZ': '156.156.0.0/14',
4308 'UA': '37.52.0.0/14',
4309 'UG': '102.80.0.0/13',
4310 'US': '6.0.0.0/8',
4311 'UY': '167.56.0.0/13',
4312 'UZ': '84.54.64.0/18',
4313 'VA': '212.77.0.0/19',
4314 'VC': '207.191.240.0/21',
4315 'VE': '186.88.0.0/13',
4316 'VG': '66.81.192.0/20',
4317 'VI': '146.226.0.0/16',
4318 'VN': '14.160.0.0/11',
4319 'VU': '202.80.32.0/20',
4320 'WF': '117.20.32.0/21',
4321 'WS': '202.4.32.0/19',
4322 'YE': '134.35.0.0/16',
4323 'YT': '41.242.116.0/22',
4324 'ZA': '41.0.0.0/11',
4325 'ZM': '102.144.0.0/13',
4326 'ZW': '102.177.192.0/18',
4327 }
4328
4329 @classmethod
4330 def random_ipv4(cls, code_or_block):
4331 if len(code_or_block) == 2:
4332 block = cls._country_ip_map.get(code_or_block.upper())
4333 if not block:
4334 return None
4335 else:
4336 block = code_or_block
4337 addr, preflen = block.split('/')
4338 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4339 addr_max = addr_min | (0xffffffff >> int(preflen))
4340 return str(socket.inet_ntoa(
4341 struct.pack('!L', random.randint(addr_min, addr_max))))
4342
4343
4344 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4345 # released into Public Domain
4346 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4347
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    # int.to_bytes replaces the original PyCrypto-era manual 32-bit chunking
    # plus leading-zero stripping. Non-positive n encodes as a single NUL
    # byte, matching the original behavior for n == 0.
    if n > 0:
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    # Pad the front so that len(s) is a multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4376
4377
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes performs the big-endian accumulation that the original
    # PyCrypto-derived code did manually with 4-byte struct chunks.
    return int.from_bytes(s, 'big')
4393
4394
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # OHDave treats the input as little-endian, hence the byte reversal
    # before interpreting the hex digits as one big integer
    plaintext_int = int(binascii.hexlify(data[::-1]), 16)
    ciphertext_int = pow(plaintext_int, exponent, modulus)
    return f'{ciphertext_int:x}'
4410
4411
4412 def pkcs1pad(data, length):
4413 """
4414 Padding input data with PKCS#1 scheme
4415
4416 @param {int[]} data input data
4417 @param {int} length target length
4418 @returns {int[]} padded data
4419 """
4420 if len(data) > length - 11:
4421 raise ValueError('Input data too long for PKCS#1 padding')
4422
4423 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4424 return [0, 2] + pseudo_random + [0] + data
4425
4426
4427 def _base_n_table(n, table):
4428 if not table and not n:
4429 raise ValueError('Either table or n must be specified')
4430 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4431
4432 if n and n != len(table):
4433 raise ValueError(f'base {n} exceeds table length {len(table)}')
4434 return table
4435
4436
4437 def encode_base_n(num, n=None, table=None):
4438 """Convert given int to a base-n string"""
4439 table = _base_n_table(n, table)
4440 if not num:
4441 return table[0]
4442
4443 result, base = '', len(table)
4444 while num:
4445 result = table[num % base] + result
4446 num = num // base
4447 return result
4448
4449
4450 def decode_base_n(string, n=None, table=None):
4451 """Convert given base-n string to int"""
4452 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4453 result, base = 0, len(table)
4454 for char in string:
4455 result = result * base + table[char]
4456 return result
4457
4458
def decode_packed_codes(code):
    """Decode "packed" obfuscated JavaScript located by PACKED_CODES_RE
    (the common eval(function(p,a,c,k,e,d){...}) pattern)."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each base-n encoded index back to its symbol; an empty symbol
    # entry means the encoded index stands for itself
    symbol_table = {}
    for index in reversed(range(count)):
        encoded = encode_base_n(index, base)
        symbol_table[encoded] = symbols[index] or encoded

    return re.sub(
        r'\b(\w+)\b', lambda match: symbol_table[match.group(0)],
        obfuscated_code)
4475
4476
def caesar(s, alphabet, shift):
    """Shift every character of `s` by `shift` positions within `alphabet`;
    characters not in the alphabet pass through unchanged."""
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = []
    for char in s:
        if char in alphabet:
            shifted.append(alphabet[(alphabet.index(char) + shift) % size])
        else:
            shifted.append(char)
    return ''.join(shifted)


def rot47(s):
    """ROT47: a Caesar shift of 47 over the 94 printable ASCII characters."""
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4488
4489
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (KEY=value,KEY2="quoted value") into a dict."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Quoted values may contain commas; strip the surrounding quotes
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
4497
4498
def urshift(val, n):
    """Unsigned 32-bit right shift (the JavaScript `>>>` operator)."""
    if val >= 0:
        return val >> n
    # Reinterpret a negative value as its unsigned 32-bit equivalent first
    return (val + 0x100000000) >> n
4501
4502
def write_xattr(path, key, value):
    """Set the extended attribute `key` to `value` (bytes) on the file at `path`.

    Tries, in order: NTFS alternate data streams (Windows), the xattr/pyxattr
    Python modules, then the setfattr/xattr command-line tools.
    Raises XAttrMetadataError when writing fails and XAttrUnavailableError
    when no write mechanism is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The external tools take the value as a command-line string argument
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4552
4553
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the components (as
    strings) of a random date between 1950-01-01 and 1995-12-31 inclusive."""
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }
4564
4565
def find_available_port(interface=''):
    """Return a currently-free TCP port on `interface` (all interfaces by
    default), or None if binding failed."""
    try:
        with socket.socket() as s:
            # Port 0 asks the OS to pick any unused port
            s.bind((interface, 0))
            _, port = s.getsockname()[:2]
            return port
    except OSError:
        return None
4573
4574
# Templates for internet shortcut files, which are plain text files.
# Each template substitutes %(url)s (and, for .desktop, %(filename)s).

# Windows ".url" shortcut (INI format)
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS ".webloc" shortcut (Apple XML property list)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org ".desktop" link (Linux desktop environments)
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Shortcut-format name -> template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4606
4607
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    # Rebuild the authority component: userinfo, punycoded host, port
    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    # NOTE(review): an explicit port 80 is dropped regardless of scheme, so
    # e.g. "https://host:80/" would lose its (non-default) port — confirm intended.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4650
4651
def to_high_limit_path(path):
    """On Windows, return the absolute path prefixed with the extended-length
    path marker to bypass the MAX_PATH limit; elsewhere return `path` as-is."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # Work around MAX_PATH limitation on Windows. The maximum allowed length
    # for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
4658
4659
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Extract `field` from `obj` (via traverse_obj) and interpolate it into
    `template` after applying `func`.

    Returns `default` when the value is falsy, or — if `ignore` is given —
    when the value is one of the ignored values.
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        use_default = not val
    else:
        use_default = val in variadic(ignore)
    return default if use_default else template % func(val)
4665
4666
def clean_podcast_url(url):
    """Strip known podcast tracking/measurement redirect prefixes from `url`."""
    url = re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/''', '', url)
    # Collapse a doubled scheme left behind by the prefix removal,
    # e.g. "https://https://example.com/..." -> "https://example.com/..."
    return re.sub(r'^\w+://(\w+://)', r'\1', url)
4687
4688
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random RFC 4122 version-4 UUID string.

    The 'y' nibble carries the RFC 4122 variant and must be 8, 9, a or b;
    the previous implementation drew it from all 16 hex digits, producing
    invalid UUIDs three times out of four.
    """
    def _fill(match):
        if match.group(0) == 'x':
            return _HEX_TABLE[random.randint(0, 15)]
        # Variant nibble: binary 10xx -> one of 8, 9, a, b
        return _HEX_TABLE[8 + random.randint(0, 3)]

    return re.sub(r'[xy]', _fill, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4694
4695
def make_dir(path, to_screen=None):
    """Ensure the parent directory of `path` exists.

    Returns True on success (including when `path` has no directory
    component), False on failure. When creation fails and `to_screen` is
    callable, it is invoked with an error message.
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # BUG FIX: was `if callable(to_screen) is not None:` — callable()
        # returns a bool, so the test was always True and calling a
        # non-callable to_screen (e.g. None) raised TypeError
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4706
4707
def get_executable_path():
    """Return the directory containing the running executable/script
    (the path component of _get_variant_and_executable_path's result)."""
    # Imported lazily to avoid a circular import with the update module
    from ..update import _get_variant_and_executable_path

    return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
4712
4713
def get_user_config_dirs(package_name):
    """Yield the candidate per-user configuration directories for `package_name`."""
    # XDG base directory (e.g. ~/.config/package_name)
    xdg_base = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
    yield os.path.join(xdg_base, package_name)

    # Windows roaming profile (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # Dotted directory directly in the home folder (~/.package_name)
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')
4726
4727
def get_system_config_dirs(package_name):
    """Yield the system-wide configuration directories for `package_name`."""
    # Currently only /etc/package_name
    yield os.path.join('/etc', package_name)
4731
4732
def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
    """
    # Any datetime.timedelta keyword (hours=..., days=..., etc.) offsets the result
    offset = datetime.timedelta(**kwargs).total_seconds()
    return time.time() + offset
4738
4739
4740 # create a JSON Web Signature (jws) with HS256 algorithm
4741 # the resulting format is in JWS Compact Serialization
4742 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
4743 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JWS token (JWS Compact Serialization) signed with HS256.

    @param payload_data  JSON-serializable claims dict
    @param key           shared secret (str)
    @param headers       extra/overriding JOSE header fields (optional)
    @returns bytes       b"<header>.<payload>.<signature>"
    """
    # FIX: `headers` previously defaulted to a mutable dict ({})
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    # NOTE(review): RFC 7515 specifies URL-safe base64 without padding; this
    # uses standard base64 with padding — kept as-is since existing consumers
    # of these tokens evidently accept it.
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4757
4758
4759 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode the payload of a JWS compact token (the signature is NOT verified)."""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # Restore any stripped base64 padding; superfluous '=' are ignored
    return json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
4765
4766
# None means "not applicable" (non-Windows); windows_enable_vt_mode() flips
# this to True once the console accepts VT sequences
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Whether ANSI escape sequences can sensibly be written to `stream`."""
    if compat_os_name == 'nt':
        # The console must have been switched into VT mode first
        terminal_ok = bool(WINDOWS_VT_MODE)
    else:
        terminal_ok = bool(os.getenv('TERM'))
    if not terminal_ok:
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
4781
4782
def windows_enable_vt_mode():
    """Ref: https://bugs.python.org/issue30075 """
    if get_windows_version() < (10, 0, 10586):
        # Virtual-terminal processing requires Windows 10 build 10586+
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly (independent of stdout redirection)
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Keep the existing mode bits; only add VT processing
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Invalidate cached answers now that VT mode is available
    supports_terminal_sequences.cache_clear()
4813
4814
# Matches ANSI SGR escape sequences, e.g. "\033[31m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip ANSI SGR (color/style) escape sequences from `string`."""
    return _terminal_sequences_re.sub('', string)
4820
4821
def number_of_digits(number):
    """Length of `number`'s decimal representation (a leading '-' counts)."""
    as_decimal = '%d' % number
    return len(as_decimal)
4824
4825
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy `values` (stringified) with `delim`; when `from_dict`
    is given, each value is first used as a traversal path into that dict."""
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    truthy = (str(value) for value in values if value)
    return delim.join(truthy)
4830
4831
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        # No format reports a width; nothing to scale against
        return thumbnails
    max_width = str(max_dimensions[0])
    scaled = []
    for thumbnail in thumbnails:
        scaled.append(merge_dicts(
            {'url': re.sub(url_width_re, max_width, thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
4852
4853
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not crg:
        return None, None, None
    start, end, total = crg.groups()
    # `start` always matches; `end` and `total` may be absent
    return (
        int(start),
        int(end) if end is not None else None,
        int(total) if total is not None else None,
    )
4862
4863
def read_stdin(what):
    """Announce that `what` will be read from STDIN, then return sys.stdin."""
    # The EOF key combination differs per platform
    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
4868
4869
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """
    # A byte-order mark takes precedence over any in-band declaration
    for bom, encoding in BOMS:
        if data.startswith(bom):
            return encoding, len(bom)

    # Drop NUL bytes so a "# coding: ..." declaration matches even for
    # UTF-16/UTF-32 text, whatever the endianness
    stripped = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if mobj:
        return mobj.group(1).decode(), 0
    return None, 0
4886
4887
class Config:
    """One source of program options (command line, a config file, or stdin),
    plus any configs pulled in recursively via --config-locations.
    """
    own_args = None     # raw args this config was initialized with
    parsed_args = None  # same as own_args once load_configs() has run
    filename = None     # config file path, when loaded from a file
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        # _loaded_paths is shared with child configs to detect include loops
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own args and recursively load any --config-locations.

        Returns False (and loads nothing) when this config's file was
        already loaded, which breaks include cycles.
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # '-' means: read extra options from stdin (at most once)
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against this config's directory
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read and shlex-split a config file; returns `default` when absent."""
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # BUG FIX: the message previously did not name the offending file
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with the values of credential options redacted."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Redact "--password=secret" style options
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Redact "--password secret" style options (value in the next arg)
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        # Child configs' args first (in reverse append order), then our own
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
4995
4996
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes"""
    pool = None  # the connected protocol object, set by __enter__

    def __init__(self, url, headers=None, connect=True):
        # A private event loop lets synchronous code drive the async websockets API
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Make sure the connection is torn down at interpreter exit
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        # Blocking send on the private loop
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        # Blocking receive on the private loop
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            self.loop.close()
            # NOTE(review): tasks are cancelled AFTER the loop is closed, and
            # _cancel_all_tasks calls run_until_complete on that closed loop —
            # confirm this ordering is intentional
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        # Synchronously run the coroutine `main` to completion on `loop`
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        # Surface any exceptions from cancelled work via the loop's handler
        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5066
5067
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for key, value in headers.items():
            # Title-casing unifies e.g. 'content-type' and 'CONTENT-TYPE'
            merged[key.title()] = value
    return merged
5071
5072
def cached_method(f):
    """Cache a method"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Bind and apply defaults so positional/keyword spellings of the
        # same call share a cache key
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        cache_key = tuple(bound.arguments.values())[1:]  # drop `self`

        # Per-instance cache, keyed first by method name
        all_caches = vars(self).setdefault('_cached_method__cache', {})
        cache = all_caches.setdefault(f.__name__, {})
        if cache_key not in cache:
            cache[cache_key] = f(self, *args, **kwargs)
        return cache[cache_key]
    return wrapper
5088
5089
class classproperty:
    """property access for class methods with optional caching"""

    def __new__(cls, func=None, *args, **kwargs):
        # Support both bare `@classproperty` and `@classproperty(cache=True)`
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        if self._cache is None:
            # Uncached: recompute on every access
            return self.func(cls)
        try:
            return self._cache[cls]
        except KeyError:
            value = self._cache[cls] = self.func(cls)
            return value
5108
5109
class function_with_repr:
    """Wrap a callable so repr() yields a readable identifier."""

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self._repr_override = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        if self._repr_override:
            return self._repr_override
        return f'{self.func.__module__}.{self.func.__qualname__}'
5122
5123
class Namespace(types.SimpleNamespace):
    """Namespace whose iteration yields the stored attribute values."""

    def __iter__(self):
        yield from vars(self).values()

    @property
    def items_(self):
        # Trailing underscore avoids clashing with a stored attribute "items"
        return vars(self).items()
5133
5134
# Known media-file extensions, grouped by kind. The "common_*" groups are
# folded into the main "video"/"audio" tuples immediately below.
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions combined
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5149
5150
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # NO_DEFAULT is used as a sentinel meaning "no error recorded for the
    # current attempt"; `attempt`/`_error` here are class-level defaults
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        # Extra kwargs are pre-bound onto the error callback
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # Hide the internal sentinel from callers
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Reset the per-attempt error before yielding control
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            # If the body recorded an error, report it via the callback
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            # Out of retries: report via `error` if provided, else re-raise
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        # sleep_func may be a constant delay or a callable taking the attempt number
        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5205
5206
def make_archive_id(ie, video_id):
    """Build a download-archive entry: '<lowercased extractor key> <video id>'.

    `ie` may be either an extractor key string or an extractor (class/instance)
    exposing `ie_key()`.
    """
    if isinstance(ie, str):
        key = ie
    else:
        key = ie.ie_key()
    return f'{key.lower()} {video_id}'
5210
5211
def truncate_string(s, left, right=0):
    """Shorten `s` to at most `left + right` characters, marking the cut with '...'.

    Keeps `left - 3` leading characters, inserts '...', and - when `right` is
    non-zero - also keeps the trailing `right` characters.  `s` is returned
    unchanged when it is None or already short enough.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5217
5218
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Expand `options` (aliases and '-' negations) into a deduplicated list.

    `alias_dict` maps alias names to lists of entries and must contain an
    'all' entry listing every valid option.  A leading '-' removes previously
    selected entries; when `use_regex` is true, non-alias entries are matched
    case-insensitively as full-match regexes against alias_dict['all'].
    Raises ValueError for an entry that is neither an alias nor a valid option.
    """
    assert 'all' in alias_dict, '"all" alias is required'
    selected = list(start or [])
    for entry in options:
        negated = entry.startswith('-')
        name = entry[1:] if negated else entry

        if name in alias_dict:
            expansion = alias_dict[name]
            if negated:
                # Discarding an alias means discarding each member,
                # so flip the sign of every expanded entry.
                expansion = [i[1:] if i.startswith('-') else f'-{i}' for i in expansion]
            # NB: Do not allow regex in aliases for performance
            selected = orderedSet_from_options(expansion, alias_dict, start=selected)
            continue

        if use_regex:
            matched = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            matched = [name]
        else:
            matched = None
        if matched is None:
            raise ValueError(name)

        if negated:
            # Remove every occurrence of each matched entry.
            for item in matched:
                while item in selected:
                    selected.remove(item)
        else:
            selected.extend(matched)

    return orderedSet(selected)
5247
5248
5249 # TODO: Rewrite
class FormatSorter:
    """Sorts formats according to user/extractor preferences (`--format-sort`).

    Each sort field is described by an entry in `settings`; `calculate_preference`
    turns a format dict into a tuple of per-field preference tuples, suitable as
    a sort key (larger compares as "better").
    """
    # Parses one sort token: optional '+' (reverse), field name, and an
    # optional ':limit' (prefer up to limit) or '~limit' (prefer closest to limit).
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Approximation of youtube-dl's historical sort order.
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration.  Relevant keys:
    #   type: 'ordered' (ranked list), 'boolean', 'extractor', 'combined',
    #         'multiple', 'alias', or the default 'field'
    #   field: the format-dict key(s) backing this sort field
    #   order/order_free: ranking (earlier is better); 'order_free' is used
    #         when --prefer-free-formats is given; 'regex': entries are regexes
    #   convert: value conversion ('float', 'bytes', 'string', ...)
    #   forced/priority: always included / included unless --format-sort-force
    #   visible: whether to show the field in verbose output
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        # `field_preference` is the extractor-supplied sort order.
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Look up `key` for `field` in `settings`, computing/caching a default."""
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            # Unknown fields are accepted for compatibility, but deprecated.
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default  # cache the computed default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert a raw format value according to the field's 'convert' setting."""
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            # Rank the value by its position in the (possibly free-format) order
            # list; earlier entries map to larger (better) numbers.
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            # 'float'/'float_string': numbers sort numerically; once a
            # non-numeric value is seen the field degrades to string sorting.
            if value.isnumeric():
                return float(value)
            else:
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Parse forced, user, extractor and default sort fields into `_order`."""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Register one resolved sort field; first occurrence wins.
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # Order of precedence: forced fields, then (unless --format-sort-force)
        # priority fields, then user fields, extractor fields and defaults.
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Log the effective sort order (user, extractor and resolved fields)."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn one field's value into a comparable preference tuple."""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        # The leading element classifies (missing < worse-than-limit < ok < string),
        # so tuples remain comparable across branches.
        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Extract this field's raw value from `format` and rank it."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return the sort key tuple for `format`, filling in missing fields.

        NB: mutates `format` in place - protocol, extensions, bitrates and a
        preference penalty for out-of-spec HEVC-in-FLV may be filled in here.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5544
5545
5546 # XXX: Temporary
class _YDLLogger:
    """Adapter exposing a plain logger interface over a YoutubeDL instance.

    Every method is a silent no-op when constructed without a YoutubeDL object.
    """

    def __init__(self, ydl=None):
        self._ydl = ydl

    def debug(self, message):
        if not self._ydl:
            return
        self._ydl.write_debug(message)

    def info(self, message):
        if not self._ydl:
            return
        self._ydl.to_screen(message)

    def warning(self, message, *, once=False):
        if not self._ydl:
            return
        self._ydl.report_warning(message, only_once=once)

    def error(self, message, *, is_error=True):
        if not self._ydl:
            return
        self._ydl.report_error(message, is_error=is_error)

    def stdout(self, message):
        if not self._ydl:
            return
        self._ydl.to_stdout(message)

    def stderr(self, message):
        if not self._ydl:
            return
        self._ydl.to_stderr(message)