import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import asyncio, functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
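
# Illustrative usage of xpath_with_ns (not part of the original module): the
# namespace prefixes in the path are expanded via the mapping into the
# Clark-notation path that xml.etree.ElementTree expects.
#   >>> xpath_with_ns('media:song/media:author', {'media': 'http://example.com/'})
#   '{http://example.com/}song/{http://example.com/}author'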


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
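
# Illustrative usage of the attribute lookup helpers (not part of the original
# module): get_element_by_attribute() returns the decoded content, while the
# *_html_* variant returns the whole element.
#   >>> get_element_by_attribute('class', 'foo', '<div class="foo">a <b>b</b></div>')
#   'a <b>b</b>'
#   >>> get_element_html_by_attribute('class', 'foo', '<div class="foo">a</div>')
#   '<div class="foo">a</div>'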


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
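
# Illustrative usage (not part of the original module): returns a (text, html)
# pair for the first matching tag, correctly handling nested tags of the same
# name.
#   >>> get_element_text_and_html_by_tag('div', '<div><div>inner</div>outer</div>')
#   ('<div>inner</div>outer', '<div><div>inner</div>outer</div>')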


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
543 """Given a string for an HTML element such as
544 <el
545 a="foo" B="bar" c="&98;az" d=boz
546 empty= noval entity="&amp;"
547 sq='"' dq="'"
548 >
549 Decode and return a dictionary of attributes.
550 {
551 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
552 'empty': '', 'noval': None, 'entity': '&',
553 'sq': '"', 'dq': '\''
554 }.
555 """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of their attribute dictionaries"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
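
# Illustrative behaviour of clean_html (not part of the original module):
# <br> and </p><p> become newlines, remaining tags are stripped and entities
# decoded.
#   >>> clean_html('a<br/>b &amp; <span>c</span>')
#   'a\nb & c'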


class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
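
# Illustrative usage of timeconvert (not part of the original module):
#   >>> timeconvert('Thu, 01 Jan 1970 00:00:00 +0000')
#   0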


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
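
# Illustrative behaviour of sanitize_filename (not part of the original
# module): forbidden characters are replaced, and restricted mode additionally
# transliterates accents and replaces anything non-ASCII.
#   >>> sanitize_filename('New World record at 0:12:34')
#   'New World record at 0_12_34'
#   >>> sanitize_filename('aäb中国的c', restricted=True)
#   'aab_c'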


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
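
# Illustrative behaviour of sanitize_url (not part of the original module):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com')
#   'https://example.com'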


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
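
# Illustrative usage of extract_basic_auth (not part of the original module):
# credentials embedded in the URL are stripped and returned as a header value.
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')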


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
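
# Illustrative usage of orderedSet (not part of the original module): order is
# preserved, and items only need to support equality, not hashing.
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]
#   >>> orderedSet([{'a': 1}, {'a': 1}])  # unhashable items are fine
#   [{'a': 1}]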


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
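
# Illustrative behaviour of unescapeHTML (not part of the original module):
# named, decimal and hexadecimal entities are all handled.
#   >>> unescapeHTML('&amp;&#38;&#x26;')
#   '&&&'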


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode
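
# Illustrative usage of Popen.run (not part of the original module; assumes
# ffprobe is on PATH): runs a command to completion and returns
# (stdout, stderr, returncode), killing the process if interrupted.
#   stdout, stderr, returncode = Popen.run(
#       ['ffprobe', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)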


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
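
# Illustrative behaviour of the time helpers (not part of the original module):
#   >>> timetuple_from_msec(345244)
#   Time(hours=0, minutes=5, seconds=45, milliseconds=244)
#   >>> formatSeconds(3661)
#   '1:01:01'
#   >>> formatSeconds(62, msec=True)
#   '1:02.000'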


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        try:
            context.load_default_certs()
        # Work around the issue in load_default_certs when there are bad certificates. See:
        # https://github.com/yt-dlp/yt-dlp/issues/1060,
        # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters; however, this is not
        # always respected by websites: some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991),
        # so to work around the aforementioned issue we replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
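
# Illustrative behaviour of extract_timezone (not part of the original module):
# returns the UTC offset and the date string with the timezone part removed.
#   >>> extract_timezone('2014-05-14T10:00:00+02:00')
#   (datetime.timedelta(seconds=7200), '2014-05-14T10:00:00')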


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
1710
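# Illustrative examples (values worked out by hand from the code above, not
# recorded output):
#   >>> extract_timezone('2014-12-04 12:05:01+0200')
#   (datetime.timedelta(seconds=7200), '2014-12-04 12:05:01')
#   >>> parse_iso8601('2014-12-04T12:05:01+02:00')
#   1417687501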
1711
1712 def date_formats(day_first=True):
1713 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1714
1715
1716 def unified_strdate(date_str, day_first=True):
1717 """Return a string with the date in the format YYYYMMDD"""
1718
1719 if date_str is None:
1720 return None
1721 upload_date = None
1722 # Replace commas
1723 date_str = date_str.replace(',', ' ')
1724 # Remove AM/PM + timezone
1725 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1726 _, date_str = extract_timezone(date_str)
1727
1728 for expression in date_formats(day_first):
1729 with contextlib.suppress(ValueError):
1730 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1731 if upload_date is None:
1732 timetuple = email.utils.parsedate_tz(date_str)
1733 if timetuple:
1734 with contextlib.suppress(ValueError):
1735 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1736 if upload_date is not None:
1737 return str(upload_date)
1738
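# Illustrative example (hand-checked; assumes '%b %d %Y' is among the module's
# DATE_FORMATS): commas and AM/PM markers are stripped before parsing:
#   >>> unified_strdate('Dec 14, 2012')
#   '20121214'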
1739
1740 def unified_timestamp(date_str, day_first=True):
1741 if date_str is None:
1742 return None
1743
1744 date_str = re.sub(r'[,|]', '', date_str)
1745
1746 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1747 timezone, date_str = extract_timezone(date_str)
1748
1749 # Remove AM/PM + timezone
1750 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1751
1752 # Remove unrecognized timezones from ISO 8601 alike timestamps
1753 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1754 if m:
1755 date_str = date_str[:-len(m.group('tz'))]
1756
1757 # Python only supports microseconds, so remove nanoseconds
1758 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1759 if m:
1760 date_str = m.group(1)
1761
1762 for expression in date_formats(day_first):
1763 with contextlib.suppress(ValueError):
1764 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1765 return calendar.timegm(dt.timetuple())
1766 timetuple = email.utils.parsedate_tz(date_str)
1767 if timetuple:
1768 return calendar.timegm(timetuple) + pm_delta * 3600
1769
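# Illustrative example (hand-checked): the same style of input, parsed to a
# UNIX timestamp:
#   >>> unified_timestamp('Dec 14, 2012')
#   1355443200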
1770
1771 def determine_ext(url, default_ext='unknown_video'):
1772 if url is None or '.' not in url:
1773 return default_ext
1774 guess = url.partition('?')[0].rpartition('.')[2]
1775 if re.match(r'^[A-Za-z0-9]+$', guess):
1776 return guess
1777 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1778 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1779 return guess.rstrip('/')
1780 else:
1781 return default_ext
1782
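# Illustrative examples (hypothetical URLs): query strings and trailing
# slashes are tolerated when guessing the extension:
#   >>> determine_ext('http://example.com/video.mp4?dl=1')
#   'mp4'
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'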
1783
1784 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1785 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1786
1787
1788 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1789 R"""
1790 Return a datetime object from a string.
1791 Supported format:
1792 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1793
1794 @param format strftime format of DATE
1795 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1796 auto: round to the unit provided in date_str (if applicable).
1797 """
1798 auto_precision = False
1799 if precision == 'auto':
1800 auto_precision = True
1801 precision = 'microsecond'
1802 today = datetime_round(datetime.datetime.utcnow(), precision)
1803 if date_str in ('now', 'today'):
1804 return today
1805 if date_str == 'yesterday':
1806 return today - datetime.timedelta(days=1)
1807 match = re.match(
1808 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1809 date_str)
1810 if match is not None:
1811 start_time = datetime_from_str(match.group('start'), precision, format)
1812 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1813 unit = match.group('unit')
1814 if unit == 'month' or unit == 'year':
1815 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1816 unit = 'day'
1817 else:
1818 if unit == 'week':
1819 unit = 'day'
1820 time *= 7
1821 delta = datetime.timedelta(**{unit + 's': time})
1822 new_date = start_time + delta
1823 if auto_precision:
1824 return datetime_round(new_date, unit)
1825 return new_date
1826
1827 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1828
1829
1830 def date_from_str(date_str, format='%Y%m%d', strict=False):
1831 R"""
1832 Return a date object from a string using datetime_from_str
1833
1834 @param strict Restrict allowed patterns to "YYYYMMDD" and
1835 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1836 """
1837 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1838 raise ValueError(f'Invalid date format "{date_str}"')
1839 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1840
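# Illustrative examples (the relative form is resolved against the current
# UTC date, so its result varies):
#   >>> date_from_str('20221001')
#   datetime.date(2022, 10, 1)
#   >>> date_from_str('today-1week')  # the date 7 days ago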
1841
1842 def datetime_add_months(dt, months):
1843 """Increment/Decrement a datetime object by months."""
1844 month = dt.month + months - 1
1845 year = dt.year + month // 12
1846 month = month % 12 + 1
1847 day = min(dt.day, calendar.monthrange(year, month)[1])
1848 return dt.replace(year, month, day)
1849
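# Illustrative example (hand-checked): the day is clamped to the length of
# the target month:
#   >>> datetime_add_months(datetime.datetime(2020, 1, 31), 1)
#   datetime.datetime(2020, 2, 29, 0, 0)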
1850
1851 def datetime_round(dt, precision='day'):
1852 """
1853 Round a datetime object's time to a specific precision
1854 """
1855 if precision == 'microsecond':
1856 return dt
1857
1858 unit_seconds = {
1859 'day': 86400,
1860 'hour': 3600,
1861 'minute': 60,
1862 'second': 1,
1863 }
1864 roundto = lambda x, n: ((x + n / 2) // n) * n
1865 timestamp = calendar.timegm(dt.timetuple())
1866 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1867
1868
1869 def hyphenate_date(date_str):
1870 """
1871 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1872 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1873 if match is not None:
1874 return '-'.join(match.groups())
1875 else:
1876 return date_str
1877
1878
1879 class DateRange:
1880 """Represents a time interval between two dates"""
1881
1882 def __init__(self, start=None, end=None):
1883 """start and end must be strings in the format accepted by date"""
1884 if start is not None:
1885 self.start = date_from_str(start, strict=True)
1886 else:
1887 self.start = datetime.datetime.min.date()
1888 if end is not None:
1889 self.end = date_from_str(end, strict=True)
1890 else:
1891 self.end = datetime.datetime.max.date()
1892 if self.start > self.end:
1893 raise ValueError('Date range "%s": the start date must be before the end date' % self)
1894
1895 @classmethod
1896 def day(cls, day):
1897 """Returns a range that only contains the given day"""
1898 return cls(day, day)
1899
1900 def __contains__(self, date):
1901 """Check if the date is in the range"""
1902 if not isinstance(date, datetime.date):
1903 date = date_from_str(date)
1904 return self.start <= date <= self.end
1905
1906 def __str__(self):
1907 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1908
1909
1910 def platform_name():
1911 """ Returns the platform name as a str """
1912 res = platform.platform()
1913 if isinstance(res, bytes):
1914 res = res.decode(preferredencoding())
1915
1916 assert isinstance(res, str)
1917 return res
1918
1919
1920 @functools.cache
1921 def get_windows_version():
1922 ''' Get the Windows version as a tuple. Returns () if not running on Windows '''
1923 if compat_os_name == 'nt':
1924 return version_tuple(platform.win32_ver()[1])
1925 else:
1926 return ()
1927
1928
1929 def write_string(s, out=None, encoding=None):
1930 assert isinstance(s, str)
1931 out = out or sys.stderr
1932
1933 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1934 s = re.sub(r'([\r\n]+)', r' \1', s)
1935
1936 enc, buffer = None, out
1937 if 'b' in getattr(out, 'mode', ''):
1938 enc = encoding or preferredencoding()
1939 elif hasattr(out, 'buffer'):
1940 buffer = out.buffer
1941 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1942
1943 buffer.write(s.encode(enc, 'ignore') if enc else s)
1944 out.flush()
1945
1946
1947 def bytes_to_intlist(bs):
1948 if not bs:
1949 return []
1950 if isinstance(bs[0], int): # bytes-like objects yield ints when indexed
1951 return list(bs)
1952 else:
1953 return [ord(c) for c in bs]
1954
1955
1956 def intlist_to_bytes(xs):
1957 if not xs:
1958 return b''
1959 return struct.pack('%dB' % len(xs), *xs)
1960
1961
1962 class LockingUnsupportedError(OSError):
1963 msg = 'File locking is not supported'
1964
1965 def __init__(self):
1966 super().__init__(self.msg)
1967
1968
1969 # Cross-platform file locking
1970 if sys.platform == 'win32':
1971 import ctypes.wintypes
1972 import msvcrt
1973
1974 class OVERLAPPED(ctypes.Structure):
1975 _fields_ = [
1976 ('Internal', ctypes.wintypes.LPVOID),
1977 ('InternalHigh', ctypes.wintypes.LPVOID),
1978 ('Offset', ctypes.wintypes.DWORD),
1979 ('OffsetHigh', ctypes.wintypes.DWORD),
1980 ('hEvent', ctypes.wintypes.HANDLE),
1981 ]
1982
1983 kernel32 = ctypes.windll.kernel32
1984 LockFileEx = kernel32.LockFileEx
1985 LockFileEx.argtypes = [
1986 ctypes.wintypes.HANDLE, # hFile
1987 ctypes.wintypes.DWORD, # dwFlags
1988 ctypes.wintypes.DWORD, # dwReserved
1989 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1990 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1991 ctypes.POINTER(OVERLAPPED) # Overlapped
1992 ]
1993 LockFileEx.restype = ctypes.wintypes.BOOL
1994 UnlockFileEx = kernel32.UnlockFileEx
1995 UnlockFileEx.argtypes = [
1996 ctypes.wintypes.HANDLE, # hFile
1997 ctypes.wintypes.DWORD, # dwReserved
1998 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1999 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2000 ctypes.POINTER(OVERLAPPED) # Overlapped
2001 ]
2002 UnlockFileEx.restype = ctypes.wintypes.BOOL
2003 whole_low = 0xffffffff
2004 whole_high = 0x7fffffff
2005
2006 def _lock_file(f, exclusive, block):
2007 overlapped = OVERLAPPED()
2008 overlapped.Offset = 0
2009 overlapped.OffsetHigh = 0
2010 overlapped.hEvent = 0
2011 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2012
2013 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2014 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2015 0, whole_low, whole_high, f._lock_file_overlapped_p):
2016 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2017 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2018
2019 def _unlock_file(f):
2020 assert f._lock_file_overlapped_p
2021 handle = msvcrt.get_osfhandle(f.fileno())
2022 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2023 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2024
2025 else:
2026 try:
2027 import fcntl
2028
2029 def _lock_file(f, exclusive, block):
2030 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2031 if not block:
2032 flags |= fcntl.LOCK_NB
2033 try:
2034 fcntl.flock(f, flags)
2035 except BlockingIOError:
2036 raise
2037 except OSError: # AOSP does not have flock()
2038 fcntl.lockf(f, flags)
2039
2040 def _unlock_file(f):
2041 try:
2042 fcntl.flock(f, fcntl.LOCK_UN)
2043 except OSError:
2044 fcntl.lockf(f, fcntl.LOCK_UN)
2045
2046 except ImportError:
2047
2048 def _lock_file(f, exclusive, block):
2049 raise LockingUnsupportedError()
2050
2051 def _unlock_file(f):
2052 raise LockingUnsupportedError()
2053
2054
2055 class locked_file:
2056 locked = False
2057
2058 def __init__(self, filename, mode, block=True, encoding=None):
2059 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2060 raise NotImplementedError(mode)
2061 self.mode, self.block = mode, block
2062
2063 writable = any(f in mode for f in 'wax+')
2064 readable = any(f in mode for f in 'r+')
2065 flags = functools.reduce(operator.ior, (
2066 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2067 getattr(os, 'O_BINARY', 0), # Windows only
2068 getattr(os, 'O_NOINHERIT', 0), # Windows only
2069 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2070 os.O_APPEND if 'a' in mode else 0,
2071 os.O_EXCL if 'x' in mode else 0,
2072 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2073 ))
2074
2075 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2076
2077 def __enter__(self):
2078 exclusive = 'r' not in self.mode
2079 try:
2080 _lock_file(self.f, exclusive, self.block)
2081 self.locked = True
2082 except OSError:
2083 self.f.close()
2084 raise
2085 if 'w' in self.mode:
2086 try:
2087 self.f.truncate()
2088 except OSError as e:
2089 if e.errno not in (
2090 errno.ESPIPE, # Illegal seek - expected for FIFO
2091 errno.EINVAL, # Invalid argument - expected for /dev/null
2092 ):
2093 raise
2094 return self
2095
2096 def unlock(self):
2097 if not self.locked:
2098 return
2099 try:
2100 _unlock_file(self.f)
2101 finally:
2102 self.locked = False
2103
2104 def __exit__(self, *_):
2105 try:
2106 self.unlock()
2107 finally:
2108 self.f.close()
2109
2110 open = __enter__
2111 close = __exit__
2112
2113 def __getattr__(self, attr):
2114 return getattr(self.f, attr)
2115
2116 def __iter__(self):
2117 return iter(self.f)
2118
2119
2120 @functools.cache
2121 def get_filesystem_encoding():
2122 encoding = sys.getfilesystemencoding()
2123 return encoding if encoding is not None else 'utf-8'
2124
2125
2126 def shell_quote(args):
2127 quoted_args = []
2128 encoding = get_filesystem_encoding()
2129 for a in args:
2130 if isinstance(a, bytes):
2131 # We may get a filename encoded with 'encodeFilename'
2132 a = a.decode(encoding)
2133 quoted_args.append(compat_shlex_quote(a))
2134 return ' '.join(quoted_args)
2135
2136
2137 def smuggle_url(url, data):
2138 """ Pass additional data in a URL for internal use. """
2139
2140 url, idata = unsmuggle_url(url, {})
2141 data.update(idata)
2142 sdata = urllib.parse.urlencode(
2143 {'__youtubedl_smuggle': json.dumps(data)})
2144 return url + '#' + sdata
2145
2146
2147 def unsmuggle_url(smug_url, default=None):
2148 if '#__youtubedl_smuggle' not in smug_url:
2149 return smug_url, default
2150 url, _, sdata = smug_url.rpartition('#')
2151 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2152 data = json.loads(jsond)
2153 return url, data
2154
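# Illustrative round trip (hypothetical URL and payload): the data travels in
# the URL fragment:
#   >>> url = smuggle_url('http://example.com/video', {'referer': 'http://example.com/'})
#   >>> unsmuggle_url(url)
#   ('http://example.com/video', {'referer': 'http://example.com/'})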
2155
2156 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2157 """ Formats numbers with decimal sufixes like K, M, etc """
2158 num, factor = float_or_none(num), float(factor)
2159 if num is None or num < 0:
2160 return None
2161 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2162 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2163 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2164 if factor == 1024:
2165 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2166 converted = num / (factor ** exponent)
2167 return fmt % (converted, suffix)
2168
2169
2170 def format_bytes(bytes):
2171 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2172
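# Illustrative examples (hand-checked): factor 1000 yields SI suffixes, while
# factor 1024 yields binary (KiB/MiB/...) suffixes:
#   >>> format_decimal_suffix(123456)
#   '123k'
#   >>> format_bytes(1024 ** 2)
#   '1.00MiB'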
2173
2174 def lookup_unit_table(unit_table, s):
2175 units_re = '|'.join(re.escape(u) for u in unit_table)
2176 m = re.match(
2177 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2178 if not m:
2179 return None
2180 num_str = m.group('num').replace(',', '.')
2181 mult = unit_table[m.group('unit')]
2182 return int(float(num_str) * mult)
2183
2184
2185 def parse_filesize(s):
2186 if s is None:
2187 return None
2188
2189 # The lower-case forms are of course incorrect and unofficial,
2190 # but we support those too
2191 _UNIT_TABLE = {
2192 'B': 1,
2193 'b': 1,
2194 'bytes': 1,
2195 'KiB': 1024,
2196 'KB': 1000,
2197 'kB': 1024,
2198 'Kb': 1000,
2199 'kb': 1000,
2200 'kilobytes': 1000,
2201 'kibibytes': 1024,
2202 'MiB': 1024 ** 2,
2203 'MB': 1000 ** 2,
2204 'mB': 1024 ** 2,
2205 'Mb': 1000 ** 2,
2206 'mb': 1000 ** 2,
2207 'megabytes': 1000 ** 2,
2208 'mebibytes': 1024 ** 2,
2209 'GiB': 1024 ** 3,
2210 'GB': 1000 ** 3,
2211 'gB': 1024 ** 3,
2212 'Gb': 1000 ** 3,
2213 'gb': 1000 ** 3,
2214 'gigabytes': 1000 ** 3,
2215 'gibibytes': 1024 ** 3,
2216 'TiB': 1024 ** 4,
2217 'TB': 1000 ** 4,
2218 'tB': 1024 ** 4,
2219 'Tb': 1000 ** 4,
2220 'tb': 1000 ** 4,
2221 'terabytes': 1000 ** 4,
2222 'tebibytes': 1024 ** 4,
2223 'PiB': 1024 ** 5,
2224 'PB': 1000 ** 5,
2225 'pB': 1024 ** 5,
2226 'Pb': 1000 ** 5,
2227 'pb': 1000 ** 5,
2228 'petabytes': 1000 ** 5,
2229 'pebibytes': 1024 ** 5,
2230 'EiB': 1024 ** 6,
2231 'EB': 1000 ** 6,
2232 'eB': 1024 ** 6,
2233 'Eb': 1000 ** 6,
2234 'eb': 1000 ** 6,
2235 'exabytes': 1000 ** 6,
2236 'exbibytes': 1024 ** 6,
2237 'ZiB': 1024 ** 7,
2238 'ZB': 1000 ** 7,
2239 'zB': 1024 ** 7,
2240 'Zb': 1000 ** 7,
2241 'zb': 1000 ** 7,
2242 'zettabytes': 1000 ** 7,
2243 'zebibytes': 1024 ** 7,
2244 'YiB': 1024 ** 8,
2245 'YB': 1000 ** 8,
2246 'yB': 1024 ** 8,
2247 'Yb': 1000 ** 8,
2248 'yb': 1000 ** 8,
2249 'yottabytes': 1000 ** 8,
2250 'yobibytes': 1024 ** 8,
2251 }
2252
2253 return lookup_unit_table(_UNIT_TABLE, s)
2254
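# Illustrative examples (hand-checked; note that a decimal comma is accepted):
#   >>> parse_filesize('2 MiB')
#   2097152
#   >>> parse_filesize('1,5 GB')
#   1500000000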
2255
2256 def parse_count(s):
2257 if s is None:
2258 return None
2259
2260 s = re.sub(r'^[^\d]+\s', '', s).strip()
2261
2262 if re.match(r'^[\d,.]+$', s):
2263 return str_to_int(s)
2264
2265 _UNIT_TABLE = {
2266 'k': 1000,
2267 'K': 1000,
2268 'm': 1000 ** 2,
2269 'M': 1000 ** 2,
2270 'kk': 1000 ** 2,
2271 'KK': 1000 ** 2,
2272 'b': 1000 ** 3,
2273 'B': 1000 ** 3,
2274 }
2275
2276 ret = lookup_unit_table(_UNIT_TABLE, s)
2277 if ret is not None:
2278 return ret
2279
2280 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2281 if mobj:
2282 return str_to_int(mobj.group(1))
2283
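# Illustrative examples (hand-checked):
#   >>> parse_count('1.8M')
#   1800000
#   >>> parse_count('1,234 views')
#   1234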
2284
2285 def parse_resolution(s, *, lenient=False):
2286 if s is None:
2287 return {}
2288
2289 if lenient:
2290 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2291 else:
2292 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2293 if mobj:
2294 return {
2295 'width': int(mobj.group('w')),
2296 'height': int(mobj.group('h')),
2297 }
2298
2299 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2300 if mobj:
2301 return {'height': int(mobj.group(1))}
2302
2303 mobj = re.search(r'\b([48])[kK]\b', s)
2304 if mobj:
2305 return {'height': int(mobj.group(1)) * 540}
2306
2307 return {}
2308
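# Illustrative examples (hand-checked; "4k"/"8k" map to 540 * n):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('4k')
#   {'height': 2160}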
2309
2310 def parse_bitrate(s):
2311 if not isinstance(s, str):
2312 return
2313 mobj = re.search(r'\b(\d+)\s*kbps', s)
2314 if mobj:
2315 return int(mobj.group(1))
2316
2317
2318 def month_by_name(name, lang='en'):
2319 """ Return the number of a month by (locale-independently) English name """
2320
2321 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2322
2323 try:
2324 return month_names.index(name) + 1
2325 except ValueError:
2326 return None
2327
2328
2329 def month_by_abbreviation(abbrev):
2330 """ Return the number of a month by (locale-independently) English
2331 abbreviations """
2332
2333 try:
2334 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2335 except ValueError:
2336 return None
2337
2338
2339 def fix_xml_ampersands(xml_str):
2340 """Replace all the '&' by '&amp;' in XML"""
2341 return re.sub(
2342 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2343 '&amp;',
2344 xml_str)
2345
2346
2347 def setproctitle(title):
2348 assert isinstance(title, str)
2349
2350 # ctypes in Jython is not complete
2351 # http://bugs.jython.org/issue2148
2352 if sys.platform.startswith('java'):
2353 return
2354
2355 try:
2356 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2357 except OSError:
2358 return
2359 except TypeError:
2360 # LoadLibrary in Windows Python 2.7.13 only expects
2361 # a bytestring, but since unicode_literals turns
2362 # every string into a unicode string, it fails.
2363 return
2364 title_bytes = title.encode()
2365 buf = ctypes.create_string_buffer(len(title_bytes))
2366 buf.value = title_bytes
2367 try:
2368 libc.prctl(15, buf, 0, 0, 0)
2369 except AttributeError:
2370 return # Strange libc, just skip this
2371
2372
2373 def remove_start(s, start):
2374 return s[len(start):] if s is not None and s.startswith(start) else s
2375
2376
2377 def remove_end(s, end):
2378 return s[:-len(end)] if s is not None and s.endswith(end) else s
2379
2380
2381 def remove_quotes(s):
2382 if s is None or len(s) < 2:
2383 return s
2384 for quote in ('"', "'", ):
2385 if s[0] == quote and s[-1] == quote:
2386 return s[1:-1]
2387 return s
2388
2389
2390 def get_domain(url):
2391 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2392 return domain.group('domain') if domain else None
2393
2394
2395 def url_basename(url):
2396 path = urllib.parse.urlparse(url).path
2397 return path.strip('/').split('/')[-1]
2398
2399
2400 def base_url(url):
2401 return re.match(r'https?://[^?#&]+/', url).group()
2402
2403
2404 def urljoin(base, path):
2405 if isinstance(path, bytes):
2406 path = path.decode()
2407 if not isinstance(path, str) or not path:
2408 return None
2409 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2410 return path
2411 if isinstance(base, bytes):
2412 base = base.decode()
2413 if not isinstance(base, str) or not re.match(
2414 r'^(?:https?:)?//', base):
2415 return None
2416 return urllib.parse.urljoin(base, path)
2417
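# Illustrative examples (hypothetical URLs): scheme-relative paths are
# returned as-is:
#   >>> urljoin('https://example.com/a/', 'b/c.mp4')
#   'https://example.com/a/b/c.mp4'
#   >>> urljoin('https://example.com/a/', '//cdn.example.com/x.mp4')
#   '//cdn.example.com/x.mp4'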
2418
2419 class HEADRequest(urllib.request.Request):
2420 def get_method(self):
2421 return 'HEAD'
2422
2423
2424 class PUTRequest(urllib.request.Request):
2425 def get_method(self):
2426 return 'PUT'
2427
2428
2429 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2430 if get_attr and v is not None:
2431 v = getattr(v, get_attr, None)
2432 try:
2433 return int(v) * invscale // scale
2434 except (ValueError, TypeError, OverflowError):
2435 return default
2436
2437
2438 def str_or_none(v, default=None):
2439 return default if v is None else str(v)
2440
2441
2442 def str_to_int(int_str):
2443 """ A more relaxed version of int_or_none """
2444 if isinstance(int_str, int):
2445 return int_str
2446 elif isinstance(int_str, str):
2447 int_str = re.sub(r'[,\.\+]', '', int_str)
2448 return int_or_none(int_str)
2449
2450
2451 def float_or_none(v, scale=1, invscale=1, default=None):
2452 if v is None:
2453 return default
2454 try:
2455 return float(v) * invscale / scale
2456 except (ValueError, TypeError):
2457 return default
2458
2459
2460 def bool_or_none(v, default=None):
2461 return v if isinstance(v, bool) else default
2462
2463
2464 def strip_or_none(v, default=None):
2465 return v.strip() if isinstance(v, str) else default
2466
2467
2468 def url_or_none(url):
2469 if not url or not isinstance(url, str):
2470 return None
2471 url = url.strip()
2472 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2473
2474
2475 def request_to_url(req):
2476 if isinstance(req, urllib.request.Request):
2477 return req.get_full_url()
2478 else:
2479 return req
2480
2481
2482 def strftime_or_none(timestamp, date_format, default=None):
2483 datetime_object = None
2484 try:
2485 if isinstance(timestamp, (int, float)): # unix timestamp
2486 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2487 elif isinstance(timestamp, str): # assume YYYYMMDD
2488 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2489 return datetime_object.strftime(date_format)
2490 except (ValueError, TypeError, AttributeError):
2491 return default
2492
2493
2494 def parse_duration(s):
2495 if not isinstance(s, str):
2496 return None
2497 s = s.strip()
2498 if not s:
2499 return None
2500
2501 days, hours, mins, secs, ms = [None] * 5
2502 m = re.match(r'''(?x)
2503 (?P<before_secs>
2504 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2505 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2506 (?P<ms>[.:][0-9]+)?Z?$
2507 ''', s)
2508 if m:
2509 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2510 else:
2511 m = re.match(
2512 r'''(?ix)(?:P?
2513 (?:
2514 [0-9]+\s*y(?:ears?)?,?\s*
2515 )?
2516 (?:
2517 [0-9]+\s*m(?:onths?)?,?\s*
2518 )?
2519 (?:
2520 [0-9]+\s*w(?:eeks?)?,?\s*
2521 )?
2522 (?:
2523 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2524 )?
2525 T)?
2526 (?:
2527 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2528 )?
2529 (?:
2530 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2531 )?
2532 (?:
2533 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2534 )?Z?$''', s)
2535 if m:
2536 days, hours, mins, secs, ms = m.groups()
2537 else:
2538 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2539 if m:
2540 hours, mins = m.groups()
2541 else:
2542 return None
2543
2544 if ms:
2545 ms = ms.replace(':', '.')
2546 return sum(float(part or 0) * mult for part, mult in (
2547 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2548
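# Illustrative examples (hand-checked): both colon-separated and ISO 8601-like
# durations are accepted, and a float is always returned:
#   >>> parse_duration('1:23:45')
#   5025.0
#   >>> parse_duration('PT1H30M')
#   5400.0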
2549
2550 def prepend_extension(filename, ext, expected_real_ext=None):
2551 name, real_ext = os.path.splitext(filename)
2552 return (
2553 f'{name}.{ext}{real_ext}'
2554 if not expected_real_ext or real_ext[1:] == expected_real_ext
2555 else f'{filename}.{ext}')
2556
2557
2558 def replace_extension(filename, ext, expected_real_ext=None):
2559 name, real_ext = os.path.splitext(filename)
2560 return '{}.{}'.format(
2561 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2562 ext)
2563
2564
2565 def check_executable(exe, args=[]):
2566 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2567 args can be a list of arguments for a short output (like -version) """
2568 try:
2569 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2570 except OSError:
2571 return False
2572 return exe
2573
2574
2575 def _get_exe_version_output(exe, args, *, to_screen=None):
2576 if to_screen:
2577 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2578 try:
2579 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2580 # SIGTTOU if yt-dlp is run in the background.
2581 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2582 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2583 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2584 except OSError:
2585 return False
2586 return stdout
2587
2588
2589 def detect_exe_version(output, version_re=None, unrecognized='present'):
2590 assert isinstance(output, str)
2591 if version_re is None:
2592 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2593 m = re.search(version_re, output)
2594 if m:
2595 return m.group(1)
2596 else:
2597 return unrecognized
2598
2599
2600 def get_exe_version(exe, args=['--version'],
2601 version_re=None, unrecognized='present'):
2602 """ Returns the version of the specified executable,
2603 or False if the executable is not present """
2604 out = _get_exe_version_output(exe, args)
2605 return detect_exe_version(out, version_re, unrecognized) if out else False
2606
2607
2608 def frange(start=0, stop=None, step=1):
2609 """Float range"""
2610 if stop is None:
2611 start, stop = 0, start
2612 sign = [-1, 1][step > 0] if step else 0
2613 while sign * start < sign * stop:
2614 yield start
2615 start += step
2616
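# Illustrative example (hand-checked):
#   >>> list(frange(0, 1, 0.25))
#   [0, 0.25, 0.5, 0.75]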
2617
2618 class LazyList(collections.abc.Sequence):
2619 """Lazy immutable list from an iterable
2620 Note that slices of a LazyList are lists and not LazyList"""
2621
2622 class IndexError(IndexError):
2623 pass
2624
2625 def __init__(self, iterable, *, reverse=False, _cache=None):
2626 self._iterable = iter(iterable)
2627 self._cache = [] if _cache is None else _cache
2628 self._reversed = reverse
2629
2630 def __iter__(self):
2631 if self._reversed:
2632 # We need to consume the entire iterable to iterate in reverse
2633 yield from self.exhaust()
2634 return
2635 yield from self._cache
2636 for item in self._iterable:
2637 self._cache.append(item)
2638 yield item
2639
2640 def _exhaust(self):
2641 self._cache.extend(self._iterable)
2642 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2643 return self._cache
2644
2645 def exhaust(self):
2646 """Evaluate the entire iterable"""
2647 return self._exhaust()[::-1 if self._reversed else 1]
2648
2649 @staticmethod
2650 def _reverse_index(x):
2651 return None if x is None else -(x + 1)
2652
2653 def __getitem__(self, idx):
2654 if isinstance(idx, slice):
2655 if self._reversed:
2656 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2657 start, stop, step = idx.start, idx.stop, idx.step or 1
2658 elif isinstance(idx, int):
2659 if self._reversed:
2660 idx = self._reverse_index(idx)
2661 start, stop, step = idx, idx, 0
2662 else:
2663 raise TypeError('indices must be integers or slices')
2664 if ((start or 0) < 0 or (stop or 0) < 0
2665 or (start is None and step < 0)
2666 or (stop is None and step > 0)):
2667 # We need to consume the entire iterable to be able to slice from the end
2668 # Obviously, never use this with infinite iterables
2669 self._exhaust()
2670 try:
2671 return self._cache[idx]
2672 except IndexError as e:
2673 raise self.IndexError(e) from e
2674 n = max(start or 0, stop or 0) - len(self._cache) + 1
2675 if n > 0:
2676 self._cache.extend(itertools.islice(self._iterable, n))
2677 try:
2678 return self._cache[idx]
2679 except IndexError as e:
2680 raise self.IndexError(e) from e
2681
2682 def __bool__(self):
2683 try:
2684 self[-1] if self._reversed else self[0]
2685 except self.IndexError:
2686 return False
2687 return True
2688
2689 def __len__(self):
2690 self._exhaust()
2691 return len(self._cache)
2692
2693 def __reversed__(self):
2694 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2695
2696 def __copy__(self):
2697 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2698
2699 def __repr__(self):
2700 # repr and str should mimic a list. So we exhaust the iterable
2701 return repr(self.exhaust())
2702
2703 def __str__(self):
2704 return repr(self.exhaust())
2705
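# Illustrative example (hand-checked): only as much of the iterable as needed
# is consumed, so bounded slices of infinite iterators are safe:
#   >>> ll = LazyList(itertools.count())
#   >>> ll[:5]
#   [0, 1, 2, 3, 4]
#   >>> ll[10]
#   10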
2706
2707 class PagedList:
2708
2709 class IndexError(IndexError):
2710 pass
2711
2712 def __len__(self):
2713 # This is only useful for tests
2714 return len(self.getslice())
2715
2716 def __init__(self, pagefunc, pagesize, use_cache=True):
2717 self._pagefunc = pagefunc
2718 self._pagesize = pagesize
2719 self._pagecount = float('inf')
2720 self._use_cache = use_cache
2721 self._cache = {}
2722
2723 def getpage(self, pagenum):
2724 page_results = self._cache.get(pagenum)
2725 if page_results is None:
2726 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2727 if self._use_cache:
2728 self._cache[pagenum] = page_results
2729 return page_results
2730
2731 def getslice(self, start=0, end=None):
2732 return list(self._getslice(start, end))
2733
2734 def _getslice(self, start, end):
2735 raise NotImplementedError('This method must be implemented by subclasses')
2736
2737 def __getitem__(self, idx):
2738 assert self._use_cache, 'Indexing PagedList requires cache'
2739 if not isinstance(idx, int) or idx < 0:
2740 raise TypeError('indices must be non-negative integers')
2741 entries = self.getslice(idx, idx + 1)
2742 if not entries:
2743 raise self.IndexError()
2744 return entries[0]
2745
2746
2747 class OnDemandPagedList(PagedList):
2748 """Download pages until a page with less than maximum results"""
2749
2750 def _getslice(self, start, end):
2751 for pagenum in itertools.count(start // self._pagesize):
2752 firstid = pagenum * self._pagesize
2753 nextfirstid = pagenum * self._pagesize + self._pagesize
2754 if start >= nextfirstid:
2755 continue
2756
2757 startv = (
2758 start % self._pagesize
2759 if firstid <= start < nextfirstid
2760 else 0)
2761 endv = (
2762 ((end - 1) % self._pagesize) + 1
2763 if (end is not None and firstid <= end <= nextfirstid)
2764 else None)
2765
2766 try:
2767 page_results = self.getpage(pagenum)
2768 except Exception:
2769 self._pagecount = pagenum - 1
2770 raise
2771 if startv != 0 or endv is not None:
2772 page_results = page_results[startv:endv]
2773 yield from page_results
2774
2775 # A little optimization - if the current page is not "full", i.e. does
2776 # not contain page_size videos, then we can assume that this page
2777 # is the last one - there are no more ids on further pages -
2778 # so there is no need to query again.
2779 if len(page_results) + startv < self._pagesize:
2780 break
2781
2782 # If we got the whole page, but the next page is not interesting,
2783 # break out early as well
2784 if end == nextfirstid:
2785 break
2786
2787
2788 class InAdvancePagedList(PagedList):
2789 """PagedList with total number of pages known in advance"""
2790
2791 def __init__(self, pagefunc, pagecount, pagesize):
2792 PagedList.__init__(self, pagefunc, pagesize, True)
2793 self._pagecount = pagecount
2794
2795 def _getslice(self, start, end):
2796 start_page = start // self._pagesize
2797 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2798 skip_elems = start - start_page * self._pagesize
2799 only_more = None if end is None else end - start
2800 for pagenum in range(start_page, end_page):
2801 page_results = self.getpage(pagenum)
2802 if skip_elems:
2803 page_results = page_results[skip_elems:]
2804 skip_elems = None
2805 if only_more is not None:
2806 if len(page_results) < only_more:
2807 only_more -= len(page_results)
2808 else:
2809 yield from page_results[:only_more]
2810 break
2811 yield from page_results
2812
2813
2814 class PlaylistEntries:
2815 MissingEntry = object()
2816 is_exhausted = False
2817
2818 def __init__(self, ydl, info_dict):
2819 self.ydl = ydl
2820
2821 # _entries must be assigned now since infodict can change during iteration
2822 entries = info_dict.get('entries')
2823 if entries is None:
2824 raise EntryNotInPlaylist('There are no entries')
2825 elif isinstance(entries, list):
2826 self.is_exhausted = True
2827
2828 requested_entries = info_dict.get('requested_entries')
2829 self.is_incomplete = bool(requested_entries)
2830 if self.is_incomplete:
2831 assert self.is_exhausted
2832 self._entries = [self.MissingEntry] * max(requested_entries)
2833 for i, entry in zip(requested_entries, entries):
2834 self._entries[i - 1] = entry
2835 elif isinstance(entries, (list, PagedList, LazyList)):
2836 self._entries = entries
2837 else:
2838 self._entries = LazyList(entries)
2839
2840 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2841 (?P<start>[+-]?\d+)?
2842 (?P<range>[:-]
2843 (?P<end>[+-]?\d+|inf(?:inite)?)?
2844 (?::(?P<step>[+-]?\d+))?
2845 )?''')
2846
2847 @classmethod
2848 def parse_playlist_items(cls, string):
2849 for segment in string.split(','):
2850 if not segment:
2851 raise ValueError('There are two or more consecutive commas')
2852 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2853 if not mobj:
2854 raise ValueError(f'{segment!r} is not a valid specification')
2855 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2856 if int_or_none(step) == 0:
2857 raise ValueError(f'Step in {segment!r} cannot be zero')
2858 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2859
2860 def get_requested_items(self):
2861 playlist_items = self.ydl.params.get('playlist_items')
2862 playlist_start = self.ydl.params.get('playliststart', 1)
2863 playlist_end = self.ydl.params.get('playlistend')
2864 # For backwards compatibility, interpret -1 as whole list
2865 if playlist_end in (-1, None):
2866 playlist_end = ''
2867 if not playlist_items:
2868 playlist_items = f'{playlist_start}:{playlist_end}'
2869 elif playlist_start != 1 or playlist_end:
2870 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2871
2872 for index in self.parse_playlist_items(playlist_items):
2873 for i, entry in self[index]:
2874 yield i, entry
2875 if not entry:
2876 continue
2877 try:
2878 # TODO: Add auto-generated fields
2879 self.ydl._match_entry(entry, incomplete=True, silent=True)
2880 except (ExistingVideoReached, RejectedVideoReached):
2881 return
2882
2883 def get_full_count(self):
2884 if self.is_exhausted and not self.is_incomplete:
2885 return len(self)
2886 elif isinstance(self._entries, InAdvancePagedList):
2887 if self._entries._pagesize == 1:
2888 return self._entries._pagecount
2889
2890 @functools.cached_property
2891 def _getter(self):
2892 if isinstance(self._entries, list):
2893 def get_entry(i):
2894 try:
2895 entry = self._entries[i]
2896 except IndexError:
2897 entry = self.MissingEntry
2898 if not self.is_incomplete:
2899 raise self.IndexError()
2900 if entry is self.MissingEntry:
2901 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2902 return entry
2903 else:
2904 def get_entry(i):
2905 try:
2906 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2907 except (LazyList.IndexError, PagedList.IndexError):
2908 raise self.IndexError()
2909 return get_entry
2910
2911 def __getitem__(self, idx):
2912 if isinstance(idx, int):
2913 idx = slice(idx, idx)
2914
2915 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2916 step = 1 if idx.step is None else idx.step
2917 if idx.start is None:
2918 start = 0 if step > 0 else len(self) - 1
2919 else:
2920 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2921
2922 # NB: Do not call len(self) when idx == [:]
2923 if idx.stop is None:
2924 stop = 0 if step < 0 else float('inf')
2925 else:
2926 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2927 stop += [-1, 1][step > 0]
2928
2929 for i in frange(start, stop, step):
2930 if i < 0:
2931 continue
2932 try:
2933 entry = self._getter(i)
2934 except self.IndexError:
2935 self.is_exhausted = True
2936 if step > 0:
2937 break
2938 continue
2939 yield i + 1, entry
2940
2941 def __len__(self):
2942 return len(tuple(self[:]))
2943
2944 class IndexError(IndexError):
2945 pass
2946
2947
2948 def uppercase_escape(s):
2949 unicode_escape = codecs.getdecoder('unicode_escape')
2950 return re.sub(
2951 r'\\U[0-9a-fA-F]{8}',
2952 lambda m: unicode_escape(m.group(0))[0],
2953 s)
2954
2955
2956 def lowercase_escape(s):
2957 unicode_escape = codecs.getdecoder('unicode_escape')
2958 return re.sub(
2959 r'\\u[0-9a-fA-F]{4}',
2960 lambda m: unicode_escape(m.group(0))[0],
2961 s)
2962
2963
2964 def escape_rfc3986(s):
2965 """Escape non-ASCII characters as suggested by RFC 3986"""
2966 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2967
2968
2969 def escape_url(url):
2970 """Escape URL as suggested by RFC 3986"""
2971 url_parsed = urllib.parse.urlparse(url)
2972 return url_parsed._replace(
2973 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2974 path=escape_rfc3986(url_parsed.path),
2975 params=escape_rfc3986(url_parsed.params),
2976 query=escape_rfc3986(url_parsed.query),
2977 fragment=escape_rfc3986(url_parsed.fragment)
2978 ).geturl()
2979
2980
2981 def parse_qs(url):
2982 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
2983
2984
2985 def read_batch_urls(batch_fd):
2986 def fixup(url):
2987 if not isinstance(url, str):
2988 url = url.decode('utf-8', 'replace')
2989 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2990 for bom in BOM_UTF8:
2991 if url.startswith(bom):
2992 url = url[len(bom):]
2993 url = url.lstrip()
2994 if not url or url.startswith(('#', ';', ']')):
2995 return False
2996 # "#" cannot be stripped out since it is part of the URI
2997 # However, it can be safely stripped out if following a whitespace
2998 return re.split(r'\s#', url, 1)[0].rstrip()
2999
3000 with contextlib.closing(batch_fd) as fd:
3001 return [url for url in map(fixup, fd) if url]
3002
3003
3004 def urlencode_postdata(*args, **kargs):
3005 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3006
3007
3008 def update_url_query(url, query):
3009 if not query:
3010 return url
3011 parsed_url = urllib.parse.urlparse(url)
3012 qs = urllib.parse.parse_qs(parsed_url.query)
3013 qs.update(query)
3014 return urllib.parse.urlunparse(parsed_url._replace(
3015 query=urllib.parse.urlencode(qs, True)))
3016
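# Illustrative example (hypothetical URL):
#   >>> update_url_query('https://example.com/path?a=1', {'b': '2'})
#   'https://example.com/path?a=1&b=2'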
3017
3018 def update_Request(req, url=None, data=None, headers={}, query={}):
3019 req_headers = req.headers.copy()
3020 req_headers.update(headers)
3021 req_data = data or req.data
3022 req_url = update_url_query(url or req.get_full_url(), query)
3023 req_get_method = req.get_method()
3024 if req_get_method == 'HEAD':
3025 req_type = HEADRequest
3026 elif req_get_method == 'PUT':
3027 req_type = PUTRequest
3028 else:
3029 req_type = urllib.request.Request
3030 new_req = req_type(
3031 req_url, data=req_data, headers=req_headers,
3032 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3033 if hasattr(req, 'timeout'):
3034 new_req.timeout = req.timeout
3035 return new_req
3036
3037
3038 def _multipart_encode_impl(data, boundary):
3039 content_type = 'multipart/form-data; boundary=%s' % boundary
3040
3041 out = b''
3042 for k, v in data.items():
3043 out += b'--' + boundary.encode('ascii') + b'\r\n'
3044 if isinstance(k, str):
3045 k = k.encode()
3046 if isinstance(v, str):
3047 v = v.encode()
3048 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3049 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3050 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3051 if boundary.encode('ascii') in content:
3052 raise ValueError('Boundary overlaps with data')
3053 out += content
3054
3055 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3056
3057 return out, content_type
3058
3059
3060 def multipart_encode(data, boundary=None):
3061 '''
3062 Encode a dict to RFC 7578-compliant form-data
3063
3064 data:
3065 A dict where keys and values can be either Unicode or bytes-like
3066 objects.
3067 boundary:
3068 If specified a Unicode object, it's used as the boundary. Otherwise
3069 a random boundary is generated.
3070
3071 Reference: https://tools.ietf.org/html/rfc7578
3072 '''
3073 has_specified_boundary = boundary is not None
3074
3075 while True:
3076 if boundary is None:
3077 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3078
3079 try:
3080 out, content_type = _multipart_encode_impl(data, boundary)
3081 break
3082 except ValueError:
3083 if has_specified_boundary:
3084 raise
3085 boundary = None
3086
3087 return out, content_type
3088
3089
3090 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3091 for val in map(d.get, variadic(key_or_keys)):
3092 if val is not None and (val or not skip_false_values):
3093 return val
3094 return default
3095
3096
3097 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3098 for f in funcs:
3099 try:
3100 val = f(*args, **kwargs)
3101 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3102 pass
3103 else:
3104 if expected_type is None or isinstance(val, expected_type):
3105 return val
3106
3107
3108 def try_get(src, getter, expected_type=None):
3109 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3110
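# Illustrative examples (hand-checked): dict_get skips falsy values by
# default, and try_get swallows lookup errors:
#   >>> dict_get({'a': None, 'b': ''}, ('a', 'b'), default='fallback')
#   'fallback'
#   >>> try_get({'a': [1]}, lambda x: x['a'][0], int)
#   1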
3111
3112 def filter_dict(dct, cndn=lambda _, v: v is not None):
3113 return {k: v for k, v in dct.items() if cndn(k, v)}
3114
3115
3116 def merge_dicts(*dicts):
3117 merged = {}
3118 for a_dict in dicts:
3119 for k, v in a_dict.items():
3120 if (v is not None and k not in merged
3121 or isinstance(v, str) and merged[k] == ''):
3122 merged[k] = v
3123 return merged
3124
3125
3126 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3127 return string if isinstance(string, str) else str(string, encoding, errors)
3128
3129
3130 US_RATINGS = {
3131 'G': 0,
3132 'PG': 10,
3133 'PG-13': 13,
3134 'R': 16,
3135 'NC': 18,
3136 }
3137
3138
3139 TV_PARENTAL_GUIDELINES = {
3140 'TV-Y': 0,
3141 'TV-Y7': 7,
3142 'TV-G': 0,
3143 'TV-PG': 0,
3144 'TV-14': 14,
3145 'TV-MA': 17,
3146 }
3147
3148
3149 def parse_age_limit(s):
3150 # isinstance(False, int) is True. So type() must be used instead
3151 if type(s) is int: # noqa: E721
3152 return s if 0 <= s <= 21 else None
3153 elif not isinstance(s, str):
3154 return None
3155 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3156 if m:
3157 return int(m.group('age'))
3158 s = s.upper()
3159 if s in US_RATINGS:
3160 return US_RATINGS[s]
3161 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3162 if m:
3163 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3164 return None
3165
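# Illustrative examples (hand-checked):
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17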
3166
3167 def strip_jsonp(code):
3168 return re.sub(
3169 r'''(?sx)^
3170 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3171 (?:\s*&&\s*(?P=func_name))?
3172 \s*\(\s*(?P<callback_data>.*)\);?
3173 \s*?(?://[^\n]*)*$''',
3174 r'\g<callback_data>', code)
3175
3176
3177 def js_to_json(code, vars={}):
3178 # vars is a dict of var, val pairs to substitute
3179 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3180 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3181 INTEGER_TABLE = (
3182 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3183 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3184 )
3185
3186 def fix_kv(m):
3187 v = m.group(0)
3188 if v in ('true', 'false', 'null'):
3189 return v
3190 elif v in ('undefined', 'void 0'):
3191 return 'null'
3192 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3193 return ""
3194
3195 if v[0] in ("'", '"'):
3196 v = re.sub(r'(?s)\\.|"', lambda m: {
3197 '"': '\\"',
3198 "\\'": "'",
3199 '\\\n': '',
3200 '\\x': '\\u00',
3201 }.get(m.group(0), m.group(0)), v[1:-1])
3202 else:
3203 for regex, base in INTEGER_TABLE:
3204 im = re.match(regex, v)
3205 if im:
3206 i = int(im.group(1), base)
3207 return '"%d":' % i if v.endswith(':') else '%d' % i
3208
3209 if v in vars:
3210 return vars[v]
3211
3212 return '"%s"' % v
3213
3214 def create_map(mobj):
3215 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3216
3217 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3218 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3219
3220 return re.sub(r'''(?sx)
3221 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3222 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3223 {comment}|,(?={skip}[\]}}])|
3224 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3225 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3226 [0-9]+(?={skip}:)|
3227 !+
3228 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3229
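# Illustrative example (hand-checked): bare keys are quoted, single-quoted
# strings are re-quoted and hex literals are converted to decimal:
#   >>> js_to_json("{abc: 'def', ghi: 0x10}")
#   '{"abc": "def", "ghi": 16}'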
3230
3231 def qualities(quality_ids):
3232 """ Get a numeric quality value out of a list of possible values """
3233 def q(qid):
3234 try:
3235 return quality_ids.index(qid)
3236 except ValueError:
3237 return -1
3238 return q
3239
3240
3241 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3242
3243
3244 DEFAULT_OUTTMPL = {
3245 'default': '%(title)s [%(id)s].%(ext)s',
3246 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3247 }
3248 OUTTMPL_TYPES = {
3249 'chapter': None,
3250 'subtitle': None,
3251 'thumbnail': None,
3252 'description': 'description',
3253 'annotation': 'annotations.xml',
3254 'infojson': 'info.json',
3255 'link': None,
3256 'pl_video': None,
3257 'pl_thumbnail': None,
3258 'pl_description': 'description',
3259 'pl_infojson': 'info.json',
3260 }
3261
3262 # As of [1] format syntax is:
3263 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3264 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3265 STR_FORMAT_RE_TMPL = r'''(?x)
3266 (?<!%)(?P<prefix>(?:%%)*)
3267 %
3268 (?P<has_key>\((?P<key>{0})\))?
3269 (?P<format>
3270 (?P<conversion>[#0\-+ ]+)?
3271 (?P<min_width>\d+)?
3272 (?P<precision>\.\d+)?
3273 (?P<len_mod>[hlL])? # unused in python
3274 {1} # conversion type
3275 )
3276 '''
3277
3278
3279 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3280
3281
3282 def limit_length(s, length):
3283 """ Add ellipses to overly long strings """
3284 if s is None:
3285 return None
3286 ELLIPSES = '...'
3287 if len(s) > length:
3288 return s[:length - len(ELLIPSES)] + ELLIPSES
3289 return s
3290
3291
3292 def version_tuple(v):
3293 return tuple(int(e) for e in re.split(r'[-.]', v))
3294
3295
3296 def is_outdated_version(version, limit, assume_new=True):
3297 if not version:
3298 return not assume_new
3299 try:
3300 return version_tuple(version) < version_tuple(limit)
3301 except ValueError:
3302 return not assume_new
3303
3304
3305 def ytdl_is_updateable():
3306 """ Returns if yt-dlp can be updated with -U """
3307
3308 from .update import is_non_updateable
3309
3310 return not is_non_updateable()
3311
3312
3313 def args_to_str(args):
3314 # Get a short string representation for a subprocess command
3315 return ' '.join(compat_shlex_quote(a) for a in args)
3316
3317
3318 def error_to_compat_str(err):
3319 return str(err)
3320
3321
3322 def error_to_str(err):
3323 return f'{type(err).__name__}: {err}'
3324
3325
3326 def mimetype2ext(mt):
3327 if mt is None:
3328 return None
3329
3330 mt, _, params = mt.partition(';')
3331 mt = mt.strip()
3332
3333 FULL_MAP = {
3334 'audio/mp4': 'm4a',
3335 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3
3336 # here since it's the most popular one
3337 'audio/mpeg': 'mp3',
3338 'audio/x-wav': 'wav',
3339 'audio/wav': 'wav',
3340 'audio/wave': 'wav',
3341 }
3342
3343 ext = FULL_MAP.get(mt)
3344 if ext is not None:
3345 return ext
3346
3347 SUBTYPE_MAP = {
3348 '3gpp': '3gp',
3349 'smptett+xml': 'tt',
3350 'ttaf+xml': 'dfxp',
3351 'ttml+xml': 'ttml',
3352 'x-flv': 'flv',
3353 'x-mp4-fragmented': 'mp4',
3354 'x-ms-sami': 'sami',
3355 'x-ms-wmv': 'wmv',
3356 'mpegurl': 'm3u8',
3357 'x-mpegurl': 'm3u8',
3358 'vnd.apple.mpegurl': 'm3u8',
3359 'dash+xml': 'mpd',
3360 'f4m+xml': 'f4m',
3361 'hds+xml': 'f4m',
3362 'vnd.ms-sstr+xml': 'ism',
3363 'quicktime': 'mov',
3364 'mp2t': 'ts',
3365 'x-wav': 'wav',
3366 'filmstrip+json': 'fs',
3367 'svg+xml': 'svg',
3368 }
3369
3370 _, _, subtype = mt.rpartition('/')
3371 ext = SUBTYPE_MAP.get(subtype.lower())
3372 if ext is not None:
3373 return ext
3374
3375 SUFFIX_MAP = {
3376 'json': 'json',
3377 'xml': 'xml',
3378 'zip': 'zip',
3379 'gzip': 'gz',
3380 }
3381
3382 _, _, suffix = subtype.partition('+')
3383 ext = SUFFIX_MAP.get(suffix)
3384 if ext is not None:
3385 return ext
3386
3387 return subtype.replace('+', '.')
3388
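# Illustrative examples (hand-checked; MIME parameters after ';' are ignored):
#   >>> mimetype2ext('audio/x-wav;codec=pcm')
#   'wav'
#   >>> mimetype2ext('application/dash+xml')
#   'mpd'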
3389
3390 def ext2mimetype(ext_or_url):
3391 if not ext_or_url:
3392 return None
3393 if '.' not in ext_or_url:
3394 ext_or_url = f'file.{ext_or_url}'
3395 return mimetypes.guess_type(ext_or_url)[0]
3396
3397
3398 def parse_codecs(codecs_str):
3399 # http://tools.ietf.org/html/rfc6381
3400 if not codecs_str:
3401 return {}
3402 split_codecs = list(filter(None, map(
3403 str.strip, codecs_str.strip().strip(',').split(','))))
3404 vcodec, acodec, scodec, hdr = None, None, None, None
3405 for full_codec in split_codecs:
3406 parts = full_codec.split('.')
3407 codec = parts[0].replace('0', '')
3408 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3409 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3410 if not vcodec:
3411 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3412 if codec in ('dvh1', 'dvhe'):
3413 hdr = 'DV'
3414 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3415 hdr = 'HDR10'
3416 elif full_codec.replace('0', '').startswith('vp9.2'):
3417 hdr = 'HDR10'
3418 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3419 if not acodec:
3420 acodec = full_codec
3421 elif codec in ('stpp', 'wvtt',):
3422 if not scodec:
3423 scodec = full_codec
3424 else:
3425 write_string(f'WARNING: Unknown codec {full_codec}\n')
3426 if vcodec or acodec or scodec:
3427 return {
3428 'vcodec': vcodec or 'none',
3429 'acodec': acodec or 'none',
3430 'dynamic_range': hdr,
3431 **({'scodec': scodec} if scodec is not None else {}),
3432 }
3433 elif len(split_codecs) == 2:
3434 return {
3435 'vcodec': split_codecs[0],
3436 'acodec': split_codecs[1],
3437 }
3438 return {}
3439
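# Illustrative example (hand-checked):
#   >>> parse_codecs('avc1.64001F, mp4a.40.2')
#   {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', 'dynamic_range': None}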
3440
3441 def urlhandle_detect_ext(url_handle):
3442 getheader = url_handle.headers.get
3443
3444 cd = getheader('Content-Disposition')
3445 if cd:
3446 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3447 if m:
3448 e = determine_ext(m.group('filename'), default_ext=None)
3449 if e:
3450 return e
3451
3452 return mimetype2ext(getheader('Content-Type'))
3453
3454
3455 def encode_data_uri(data, mime_type):
3456 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3457
3458
3459 def age_restricted(content_limit, age_limit):
3460 """ Returns True iff the content should be blocked """
3461
3462 if age_limit is None: # No limit set
3463 return False
3464 if content_limit is None:
3465 return False # Content available for everyone
3466 return age_limit < content_limit
3467
3468
3469 def is_html(first_bytes):
3470 """ Detect whether a file contains HTML by examining its first bytes. """
3471
3472 BOMS = [
3473 (b'\xef\xbb\xbf', 'utf-8'),
3474 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3475 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3476 (b'\xff\xfe', 'utf-16-le'),
3477 (b'\xfe\xff', 'utf-16-be'),
3478 ]
3479
3480 encoding = 'utf-8'
3481 for bom, enc in BOMS:
3482 while first_bytes.startswith(bom):
3483 encoding, first_bytes = enc, first_bytes[len(bom):]
3484
3485 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3486
3487
3488 def determine_protocol(info_dict):
3489 protocol = info_dict.get('protocol')
3490 if protocol is not None:
3491 return protocol
3492
3493 url = sanitize_url(info_dict['url'])
3494 if url.startswith('rtmp'):
3495 return 'rtmp'
3496 elif url.startswith('mms'):
3497 return 'mms'
3498 elif url.startswith('rtsp'):
3499 return 'rtsp'
3500
3501 ext = determine_ext(url)
3502 if ext == 'm3u8':
3503 return 'm3u8'
3504 elif ext == 'f4m':
3505 return 'f4m'
3506
3507 return urllib.parse.urlparse(url).scheme
3508
3509
3510 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3511 """ Render a list of rows, each as a list of values.
3512 Text after a \t will be right aligned """
3513 def width(string):
3514 return len(remove_terminal_sequences(string).replace('\t', ''))
3515
3516 def get_max_lens(table):
3517 return [max(width(str(v)) for v in col) for col in zip(*table)]
3518
3519 def filter_using_list(row, filterArray):
3520 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3521
3522 max_lens = get_max_lens(data) if hide_empty else []
3523 header_row = filter_using_list(header_row, max_lens)
3524 data = [filter_using_list(row, max_lens) for row in data]
3525
3526 table = [header_row] + data
3527 max_lens = get_max_lens(table)
3528 extra_gap += 1
3529 if delim:
3530 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3531 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3532 for row in table:
3533 for pos, text in enumerate(map(str, row)):
3534 if '\t' in text:
3535 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3536 else:
3537 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3538 ret = '\n'.join(''.join(row).rstrip() for row in table)
3539 return ret
3540
3541
3542 def _match_one(filter_part, dct, incomplete):
3543 # TODO: Generalize code with YoutubeDL._build_format_filter
3544 STRING_OPERATORS = {
3545 '*=': operator.contains,
3546 '^=': lambda attr, value: attr.startswith(value),
3547 '$=': lambda attr, value: attr.endswith(value),
3548 '~=': lambda attr, value: re.search(value, attr),
3549 }
3550 COMPARISON_OPERATORS = {
3551 **STRING_OPERATORS,
3552 '<=': operator.le, # "<=" must be defined above "<"
3553 '<': operator.lt,
3554 '>=': operator.ge,
3555 '>': operator.gt,
3556 '=': operator.eq,
3557 }
3558
3559 if isinstance(incomplete, bool):
3560 is_incomplete = lambda _: incomplete
3561 else:
3562 is_incomplete = lambda k: k in incomplete
3563
3564 operator_rex = re.compile(r'''(?x)
3565 (?P<key>[a-z_]+)
3566 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3567 (?:
3568 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3569 (?P<strval>.+?)
3570 )
3571 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3572 m = operator_rex.fullmatch(filter_part.strip())
3573 if m:
3574 m = m.groupdict()
3575 unnegated_op = COMPARISON_OPERATORS[m['op']]
3576 if m['negation']:
3577 op = lambda attr, value: not unnegated_op(attr, value)
3578 else:
3579 op = unnegated_op
3580 comparison_value = m['quotedstrval'] or m['strval']
3581 if m['quote']:
3582 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3583 actual_value = dct.get(m['key'])
3584 numeric_comparison = None
3585 if isinstance(actual_value, (int, float)):
3586 # If the original field is a string and the matching comparison value is
3587 # a number, we should respect the origin of the original field
3588 # and process the comparison value as a string (see
3589 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3590 try:
3591 numeric_comparison = int(comparison_value)
3592 except ValueError:
3593 numeric_comparison = parse_filesize(comparison_value)
3594 if numeric_comparison is None:
3595 numeric_comparison = parse_filesize(f'{comparison_value}B')
3596 if numeric_comparison is None:
3597 numeric_comparison = parse_duration(comparison_value)
3598 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3599 raise ValueError('Operator %s only supports string values!' % m['op'])
3600 if actual_value is None:
3601 return is_incomplete(m['key']) or m['none_inclusive']
3602 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3603
3604 UNARY_OPERATORS = {
3605 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3606 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3607 }
3608 operator_rex = re.compile(r'''(?x)
3609 (?P<op>%s)\s*(?P<key>[a-z_]+)
3610 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3611 m = operator_rex.fullmatch(filter_part.strip())
3612 if m:
3613 op = UNARY_OPERATORS[m.group('op')]
3614 actual_value = dct.get(m.group('key'))
3615 if is_incomplete(m.group('key')) and actual_value is None:
3616 return True
3617 return op(actual_value)
3618
3619 raise ValueError('Invalid filter part %r' % filter_part)
3620
3621
3622 def match_str(filter_str, dct, incomplete=False):
3623 """ Filter a dictionary with a simple string syntax.
3624 @returns Whether the filter passes
3625 @param incomplete Set of keys that are expected to be missing from dct.
3626 Can be True/False to indicate all/none of the keys may be missing.
3627 All conditions on incomplete keys pass if the key is missing
3628 """
3629 return all(
3630 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3631 for filter_part in re.split(r'(?<!\\)&', filter_str))
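
# Illustrative example (editor's sketch): '&' separates conditions, '!' negates
# >>> match_str('duration < 120 & !is_live', {'duration': 90, 'is_live': False})
# True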
3632
3633
3634 def match_filter_func(filters):
3635 if not filters:
3636 return None
3637 filters = set(variadic(filters))
3638
3639 interactive = '-' in filters
3640 if interactive:
3641 filters.remove('-')
3642
3643 def _match_func(info_dict, incomplete=False):
3644 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3645 return NO_DEFAULT if interactive and not incomplete else None
3646 else:
3647 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3648 filter_str = ') | ('.join(map(str.strip, filters))
3649 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3650 return _match_func
3651
3652
3653 def download_range_func(chapters, ranges):
3654 def inner(info_dict, ydl):
3655 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3656 else 'Cannot match chapters since chapter information is unavailable')
3657 for regex in chapters or []:
3658 for i, chapter in enumerate(info_dict.get('chapters') or []):
3659 if re.search(regex, chapter['title']):
3660 warning = None
3661 yield {**chapter, 'index': i}
3662 if chapters and warning:
3663 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3664
3665 yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
3666
3667 return inner
3668
3669
3670 def parse_dfxp_time_expr(time_expr):
3671 if not time_expr:
3672 return
3673
3674 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3675 if mobj:
3676 return float(mobj.group('time_offset'))
3677
3678 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3679 if mobj:
3680 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
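
# Illustrative examples (editor's sketch):
# >>> parse_dfxp_time_expr('00:01:30.5')
# 90.5
# >>> parse_dfxp_time_expr('12.3s')
# 12.3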
3681
3682
3683 def srt_subtitles_timecode(seconds):
3684 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3685
3686
3687 def ass_subtitles_timecode(seconds):
3688 time = timetuple_from_msec(seconds * 1000)
3689 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
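
# Illustrative examples (editor's sketch):
# >>> srt_subtitles_timecode(3661.5)
# '01:01:01,500'
# >>> ass_subtitles_timecode(3661.5)
# '1:01:01.50'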
3690
3691
3692 def dfxp2srt(dfxp_data):
3693 '''
3694 @param dfxp_data A bytes-like object containing DFXP data
3695 @returns A unicode object containing converted SRT data
3696 '''
3697 LEGACY_NAMESPACES = (
3698 (b'http://www.w3.org/ns/ttml', [
3699 b'http://www.w3.org/2004/11/ttaf1',
3700 b'http://www.w3.org/2006/04/ttaf1',
3701 b'http://www.w3.org/2006/10/ttaf1',
3702 ]),
3703 (b'http://www.w3.org/ns/ttml#styling', [
3704 b'http://www.w3.org/ns/ttml#style',
3705 ]),
3706 )
3707
3708 SUPPORTED_STYLING = [
3709 'color',
3710 'fontFamily',
3711 'fontSize',
3712 'fontStyle',
3713 'fontWeight',
3714 'textDecoration'
3715 ]
3716
3717 _x = functools.partial(xpath_with_ns, ns_map={
3718 'xml': 'http://www.w3.org/XML/1998/namespace',
3719 'ttml': 'http://www.w3.org/ns/ttml',
3720 'tts': 'http://www.w3.org/ns/ttml#styling',
3721 })
3722
3723 styles = {}
3724 default_style = {}
3725
3726 class TTMLPElementParser:
3727 _out = ''
3728 _unclosed_elements = []
3729 _applied_styles = []
3730
3731 def start(self, tag, attrib):
3732 if tag in (_x('ttml:br'), 'br'):
3733 self._out += '\n'
3734 else:
3735 unclosed_elements = []
3736 style = {}
3737 element_style_id = attrib.get('style')
3738 if default_style:
3739 style.update(default_style)
3740 if element_style_id:
3741 style.update(styles.get(element_style_id, {}))
3742 for prop in SUPPORTED_STYLING:
3743 prop_val = attrib.get(_x('tts:' + prop))
3744 if prop_val:
3745 style[prop] = prop_val
3746 if style:
3747 font = ''
3748 for k, v in sorted(style.items()):
3749 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3750 continue
3751 if k == 'color':
3752 font += ' color="%s"' % v
3753 elif k == 'fontSize':
3754 font += ' size="%s"' % v
3755 elif k == 'fontFamily':
3756 font += ' face="%s"' % v
3757 elif k == 'fontWeight' and v == 'bold':
3758 self._out += '<b>'
3759 unclosed_elements.append('b')
3760 elif k == 'fontStyle' and v == 'italic':
3761 self._out += '<i>'
3762 unclosed_elements.append('i')
3763 elif k == 'textDecoration' and v == 'underline':
3764 self._out += '<u>'
3765 unclosed_elements.append('u')
3766 if font:
3767 self._out += '<font' + font + '>'
3768 unclosed_elements.append('font')
3769 applied_style = {}
3770 if self._applied_styles:
3771 applied_style.update(self._applied_styles[-1])
3772 applied_style.update(style)
3773 self._applied_styles.append(applied_style)
3774 self._unclosed_elements.append(unclosed_elements)
3775
3776 def end(self, tag):
3777 if tag not in (_x('ttml:br'), 'br'):
3778 unclosed_elements = self._unclosed_elements.pop()
3779 for element in reversed(unclosed_elements):
3780 self._out += '</%s>' % element
3781 if unclosed_elements and self._applied_styles:
3782 self._applied_styles.pop()
3783
3784 def data(self, data):
3785 self._out += data
3786
3787 def close(self):
3788 return self._out.strip()
3789
3790 def parse_node(node):
3791 target = TTMLPElementParser()
3792 parser = xml.etree.ElementTree.XMLParser(target=target)
3793 parser.feed(xml.etree.ElementTree.tostring(node))
3794 return parser.close()
3795
3796 for k, v in LEGACY_NAMESPACES:
3797 for ns in v:
3798 dfxp_data = dfxp_data.replace(ns, k)
3799
3800 dfxp = compat_etree_fromstring(dfxp_data)
3801 out = []
3802 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3803
3804 if not paras:
3805 raise ValueError('Invalid dfxp/TTML subtitle')
3806
3807 repeat = False
3808 while True:
3809 for style in dfxp.findall(_x('.//ttml:style')):
3810 style_id = style.get('id') or style.get(_x('xml:id'))
3811 if not style_id:
3812 continue
3813 parent_style_id = style.get('style')
3814 if parent_style_id:
3815 if parent_style_id not in styles:
3816 repeat = True
3817 continue
3818 styles[style_id] = styles[parent_style_id].copy()
3819 for prop in SUPPORTED_STYLING:
3820 prop_val = style.get(_x('tts:' + prop))
3821 if prop_val:
3822 styles.setdefault(style_id, {})[prop] = prop_val
3823 if repeat:
3824 repeat = False
3825 else:
3826 break
3827
3828 for p in ('body', 'div'):
3829 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3830 if ele is None:
3831 continue
3832 style = styles.get(ele.get('style'))
3833 if not style:
3834 continue
3835 default_style.update(style)
3836
3837 for para, index in zip(paras, itertools.count(1)):
3838 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3839 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3840 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3841 if begin_time is None:
3842 continue
3843 if not end_time:
3844 if not dur:
3845 continue
3846 end_time = begin_time + dur
3847 out.append('%d\n%s --> %s\n%s\n\n' % (
3848 index,
3849 srt_subtitles_timecode(begin_time),
3850 srt_subtitles_timecode(end_time),
3851 parse_node(para)))
3852
3853 return ''.join(out)
3854
3855
3856 def cli_option(params, command_option, param, separator=None):
3857 param = params.get(param)
3858 return ([] if param is None
3859 else [command_option, str(param)] if separator is None
3860 else [f'{command_option}{separator}{param}'])
3861
3862
3863 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3864 param = params.get(param)
3865 assert param in (True, False, None)
3866 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3867
3868
3869 def cli_valueless_option(params, command_option, param, expected_value=True):
3870 return [command_option] if params.get(param) == expected_value else []
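
# Illustrative examples (editor's sketch). Note that cli_bool_option reuses
# cli_option by looking the boolean up in a {True: ..., False: ...} map:
# >>> cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
# ['--proxy', '127.0.0.1:3128']
# >>> cli_bool_option({'check_cert': True}, '--check-certificate', 'check_cert', separator='=')
# ['--check-certificate=true']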
3871
3872
3873 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3874 if isinstance(argdict, (list, tuple)): # for backward compatibility
3875 if use_compat:
3876 return argdict
3877 else:
3878 argdict = None
3879 if argdict is None:
3880 return default
3881 assert isinstance(argdict, dict)
3882
3883 assert isinstance(keys, (list, tuple))
3884 for key_list in keys:
3885 arg_list = list(filter(
3886 lambda x: x is not None,
3887 [argdict.get(key.lower()) for key in variadic(key_list)]))
3888 if arg_list:
3889 return [arg for args in arg_list for arg in args]
3890 return default
3891
3892
3893 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3894 main_key, exe = main_key.lower(), exe.lower()
3895 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3896 keys = [f'{root_key}{k}' for k in (keys or [''])]
3897 if root_key in keys:
3898 if main_key != exe:
3899 keys.append((main_key, exe))
3900 keys.append('default')
3901 else:
3902 use_compat = False
3903 return cli_configuration_args(argdict, keys, default, use_compat)
3904
3905
3906 class ISO639Utils:
3907 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3908 _lang_map = {
3909 'aa': 'aar',
3910 'ab': 'abk',
3911 'ae': 'ave',
3912 'af': 'afr',
3913 'ak': 'aka',
3914 'am': 'amh',
3915 'an': 'arg',
3916 'ar': 'ara',
3917 'as': 'asm',
3918 'av': 'ava',
3919 'ay': 'aym',
3920 'az': 'aze',
3921 'ba': 'bak',
3922 'be': 'bel',
3923 'bg': 'bul',
3924 'bh': 'bih',
3925 'bi': 'bis',
3926 'bm': 'bam',
3927 'bn': 'ben',
3928 'bo': 'bod',
3929 'br': 'bre',
3930 'bs': 'bos',
3931 'ca': 'cat',
3932 'ce': 'che',
3933 'ch': 'cha',
3934 'co': 'cos',
3935 'cr': 'cre',
3936 'cs': 'ces',
3937 'cu': 'chu',
3938 'cv': 'chv',
3939 'cy': 'cym',
3940 'da': 'dan',
3941 'de': 'deu',
3942 'dv': 'div',
3943 'dz': 'dzo',
3944 'ee': 'ewe',
3945 'el': 'ell',
3946 'en': 'eng',
3947 'eo': 'epo',
3948 'es': 'spa',
3949 'et': 'est',
3950 'eu': 'eus',
3951 'fa': 'fas',
3952 'ff': 'ful',
3953 'fi': 'fin',
3954 'fj': 'fij',
3955 'fo': 'fao',
3956 'fr': 'fra',
3957 'fy': 'fry',
3958 'ga': 'gle',
3959 'gd': 'gla',
3960 'gl': 'glg',
3961 'gn': 'grn',
3962 'gu': 'guj',
3963 'gv': 'glv',
3964 'ha': 'hau',
3965 'he': 'heb',
3966 'iw': 'heb', # Replaced by he in 1989 revision
3967 'hi': 'hin',
3968 'ho': 'hmo',
3969 'hr': 'hrv',
3970 'ht': 'hat',
3971 'hu': 'hun',
3972 'hy': 'hye',
3973 'hz': 'her',
3974 'ia': 'ina',
3975 'id': 'ind',
3976 'in': 'ind', # Replaced by id in 1989 revision
3977 'ie': 'ile',
3978 'ig': 'ibo',
3979 'ii': 'iii',
3980 'ik': 'ipk',
3981 'io': 'ido',
3982 'is': 'isl',
3983 'it': 'ita',
3984 'iu': 'iku',
3985 'ja': 'jpn',
3986 'jv': 'jav',
3987 'ka': 'kat',
3988 'kg': 'kon',
3989 'ki': 'kik',
3990 'kj': 'kua',
3991 'kk': 'kaz',
3992 'kl': 'kal',
3993 'km': 'khm',
3994 'kn': 'kan',
3995 'ko': 'kor',
3996 'kr': 'kau',
3997 'ks': 'kas',
3998 'ku': 'kur',
3999 'kv': 'kom',
4000 'kw': 'cor',
4001 'ky': 'kir',
4002 'la': 'lat',
4003 'lb': 'ltz',
4004 'lg': 'lug',
4005 'li': 'lim',
4006 'ln': 'lin',
4007 'lo': 'lao',
4008 'lt': 'lit',
4009 'lu': 'lub',
4010 'lv': 'lav',
4011 'mg': 'mlg',
4012 'mh': 'mah',
4013 'mi': 'mri',
4014 'mk': 'mkd',
4015 'ml': 'mal',
4016 'mn': 'mon',
4017 'mr': 'mar',
4018 'ms': 'msa',
4019 'mt': 'mlt',
4020 'my': 'mya',
4021 'na': 'nau',
4022 'nb': 'nob',
4023 'nd': 'nde',
4024 'ne': 'nep',
4025 'ng': 'ndo',
4026 'nl': 'nld',
4027 'nn': 'nno',
4028 'no': 'nor',
4029 'nr': 'nbl',
4030 'nv': 'nav',
4031 'ny': 'nya',
4032 'oc': 'oci',
4033 'oj': 'oji',
4034 'om': 'orm',
4035 'or': 'ori',
4036 'os': 'oss',
4037 'pa': 'pan',
4038 'pi': 'pli',
4039 'pl': 'pol',
4040 'ps': 'pus',
4041 'pt': 'por',
4042 'qu': 'que',
4043 'rm': 'roh',
4044 'rn': 'run',
4045 'ro': 'ron',
4046 'ru': 'rus',
4047 'rw': 'kin',
4048 'sa': 'san',
4049 'sc': 'srd',
4050 'sd': 'snd',
4051 'se': 'sme',
4052 'sg': 'sag',
4053 'si': 'sin',
4054 'sk': 'slk',
4055 'sl': 'slv',
4056 'sm': 'smo',
4057 'sn': 'sna',
4058 'so': 'som',
4059 'sq': 'sqi',
4060 'sr': 'srp',
4061 'ss': 'ssw',
4062 'st': 'sot',
4063 'su': 'sun',
4064 'sv': 'swe',
4065 'sw': 'swa',
4066 'ta': 'tam',
4067 'te': 'tel',
4068 'tg': 'tgk',
4069 'th': 'tha',
4070 'ti': 'tir',
4071 'tk': 'tuk',
4072 'tl': 'tgl',
4073 'tn': 'tsn',
4074 'to': 'ton',
4075 'tr': 'tur',
4076 'ts': 'tso',
4077 'tt': 'tat',
4078 'tw': 'twi',
4079 'ty': 'tah',
4080 'ug': 'uig',
4081 'uk': 'ukr',
4082 'ur': 'urd',
4083 'uz': 'uzb',
4084 've': 'ven',
4085 'vi': 'vie',
4086 'vo': 'vol',
4087 'wa': 'wln',
4088 'wo': 'wol',
4089 'xh': 'xho',
4090 'yi': 'yid',
4091 'ji': 'yid', # Replaced by yi in 1989 revision
4092 'yo': 'yor',
4093 'za': 'zha',
4094 'zh': 'zho',
4095 'zu': 'zul',
4096 }
4097
4098 @classmethod
4099 def short2long(cls, code):
4100 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4101 return cls._lang_map.get(code[:2])
4102
4103 @classmethod
4104 def long2short(cls, code):
4105 """Convert language code from ISO 639-2/T to ISO 639-1"""
4106 for short_name, long_name in cls._lang_map.items():
4107 if long_name == code:
4108 return short_name
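
# Illustrative examples (editor's sketch):
# >>> ISO639Utils.short2long('en')
# 'eng'
# >>> ISO639Utils.long2short('deu')
# 'de'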
4109
4110
4111 class ISO3166Utils:
4112 # From http://data.okfn.org/data/core/country-list
4113 _country_map = {
4114 'AF': 'Afghanistan',
4115 'AX': 'Åland Islands',
4116 'AL': 'Albania',
4117 'DZ': 'Algeria',
4118 'AS': 'American Samoa',
4119 'AD': 'Andorra',
4120 'AO': 'Angola',
4121 'AI': 'Anguilla',
4122 'AQ': 'Antarctica',
4123 'AG': 'Antigua and Barbuda',
4124 'AR': 'Argentina',
4125 'AM': 'Armenia',
4126 'AW': 'Aruba',
4127 'AU': 'Australia',
4128 'AT': 'Austria',
4129 'AZ': 'Azerbaijan',
4130 'BS': 'Bahamas',
4131 'BH': 'Bahrain',
4132 'BD': 'Bangladesh',
4133 'BB': 'Barbados',
4134 'BY': 'Belarus',
4135 'BE': 'Belgium',
4136 'BZ': 'Belize',
4137 'BJ': 'Benin',
4138 'BM': 'Bermuda',
4139 'BT': 'Bhutan',
4140 'BO': 'Bolivia, Plurinational State of',
4141 'BQ': 'Bonaire, Sint Eustatius and Saba',
4142 'BA': 'Bosnia and Herzegovina',
4143 'BW': 'Botswana',
4144 'BV': 'Bouvet Island',
4145 'BR': 'Brazil',
4146 'IO': 'British Indian Ocean Territory',
4147 'BN': 'Brunei Darussalam',
4148 'BG': 'Bulgaria',
4149 'BF': 'Burkina Faso',
4150 'BI': 'Burundi',
4151 'KH': 'Cambodia',
4152 'CM': 'Cameroon',
4153 'CA': 'Canada',
4154 'CV': 'Cape Verde',
4155 'KY': 'Cayman Islands',
4156 'CF': 'Central African Republic',
4157 'TD': 'Chad',
4158 'CL': 'Chile',
4159 'CN': 'China',
4160 'CX': 'Christmas Island',
4161 'CC': 'Cocos (Keeling) Islands',
4162 'CO': 'Colombia',
4163 'KM': 'Comoros',
4164 'CG': 'Congo',
4165 'CD': 'Congo, the Democratic Republic of the',
4166 'CK': 'Cook Islands',
4167 'CR': 'Costa Rica',
4168 'CI': 'Côte d\'Ivoire',
4169 'HR': 'Croatia',
4170 'CU': 'Cuba',
4171 'CW': 'Curaçao',
4172 'CY': 'Cyprus',
4173 'CZ': 'Czech Republic',
4174 'DK': 'Denmark',
4175 'DJ': 'Djibouti',
4176 'DM': 'Dominica',
4177 'DO': 'Dominican Republic',
4178 'EC': 'Ecuador',
4179 'EG': 'Egypt',
4180 'SV': 'El Salvador',
4181 'GQ': 'Equatorial Guinea',
4182 'ER': 'Eritrea',
4183 'EE': 'Estonia',
4184 'ET': 'Ethiopia',
4185 'FK': 'Falkland Islands (Malvinas)',
4186 'FO': 'Faroe Islands',
4187 'FJ': 'Fiji',
4188 'FI': 'Finland',
4189 'FR': 'France',
4190 'GF': 'French Guiana',
4191 'PF': 'French Polynesia',
4192 'TF': 'French Southern Territories',
4193 'GA': 'Gabon',
4194 'GM': 'Gambia',
4195 'GE': 'Georgia',
4196 'DE': 'Germany',
4197 'GH': 'Ghana',
4198 'GI': 'Gibraltar',
4199 'GR': 'Greece',
4200 'GL': 'Greenland',
4201 'GD': 'Grenada',
4202 'GP': 'Guadeloupe',
4203 'GU': 'Guam',
4204 'GT': 'Guatemala',
4205 'GG': 'Guernsey',
4206 'GN': 'Guinea',
4207 'GW': 'Guinea-Bissau',
4208 'GY': 'Guyana',
4209 'HT': 'Haiti',
4210 'HM': 'Heard Island and McDonald Islands',
4211 'VA': 'Holy See (Vatican City State)',
4212 'HN': 'Honduras',
4213 'HK': 'Hong Kong',
4214 'HU': 'Hungary',
4215 'IS': 'Iceland',
4216 'IN': 'India',
4217 'ID': 'Indonesia',
4218 'IR': 'Iran, Islamic Republic of',
4219 'IQ': 'Iraq',
4220 'IE': 'Ireland',
4221 'IM': 'Isle of Man',
4222 'IL': 'Israel',
4223 'IT': 'Italy',
4224 'JM': 'Jamaica',
4225 'JP': 'Japan',
4226 'JE': 'Jersey',
4227 'JO': 'Jordan',
4228 'KZ': 'Kazakhstan',
4229 'KE': 'Kenya',
4230 'KI': 'Kiribati',
4231 'KP': 'Korea, Democratic People\'s Republic of',
4232 'KR': 'Korea, Republic of',
4233 'KW': 'Kuwait',
4234 'KG': 'Kyrgyzstan',
4235 'LA': 'Lao People\'s Democratic Republic',
4236 'LV': 'Latvia',
4237 'LB': 'Lebanon',
4238 'LS': 'Lesotho',
4239 'LR': 'Liberia',
4240 'LY': 'Libya',
4241 'LI': 'Liechtenstein',
4242 'LT': 'Lithuania',
4243 'LU': 'Luxembourg',
4244 'MO': 'Macao',
4245 'MK': 'Macedonia, the Former Yugoslav Republic of',
4246 'MG': 'Madagascar',
4247 'MW': 'Malawi',
4248 'MY': 'Malaysia',
4249 'MV': 'Maldives',
4250 'ML': 'Mali',
4251 'MT': 'Malta',
4252 'MH': 'Marshall Islands',
4253 'MQ': 'Martinique',
4254 'MR': 'Mauritania',
4255 'MU': 'Mauritius',
4256 'YT': 'Mayotte',
4257 'MX': 'Mexico',
4258 'FM': 'Micronesia, Federated States of',
4259 'MD': 'Moldova, Republic of',
4260 'MC': 'Monaco',
4261 'MN': 'Mongolia',
4262 'ME': 'Montenegro',
4263 'MS': 'Montserrat',
4264 'MA': 'Morocco',
4265 'MZ': 'Mozambique',
4266 'MM': 'Myanmar',
4267 'NA': 'Namibia',
4268 'NR': 'Nauru',
4269 'NP': 'Nepal',
4270 'NL': 'Netherlands',
4271 'NC': 'New Caledonia',
4272 'NZ': 'New Zealand',
4273 'NI': 'Nicaragua',
4274 'NE': 'Niger',
4275 'NG': 'Nigeria',
4276 'NU': 'Niue',
4277 'NF': 'Norfolk Island',
4278 'MP': 'Northern Mariana Islands',
4279 'NO': 'Norway',
4280 'OM': 'Oman',
4281 'PK': 'Pakistan',
4282 'PW': 'Palau',
4283 'PS': 'Palestine, State of',
4284 'PA': 'Panama',
4285 'PG': 'Papua New Guinea',
4286 'PY': 'Paraguay',
4287 'PE': 'Peru',
4288 'PH': 'Philippines',
4289 'PN': 'Pitcairn',
4290 'PL': 'Poland',
4291 'PT': 'Portugal',
4292 'PR': 'Puerto Rico',
4293 'QA': 'Qatar',
4294 'RE': 'Réunion',
4295 'RO': 'Romania',
4296 'RU': 'Russian Federation',
4297 'RW': 'Rwanda',
4298 'BL': 'Saint Barthélemy',
4299 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4300 'KN': 'Saint Kitts and Nevis',
4301 'LC': 'Saint Lucia',
4302 'MF': 'Saint Martin (French part)',
4303 'PM': 'Saint Pierre and Miquelon',
4304 'VC': 'Saint Vincent and the Grenadines',
4305 'WS': 'Samoa',
4306 'SM': 'San Marino',
4307 'ST': 'Sao Tome and Principe',
4308 'SA': 'Saudi Arabia',
4309 'SN': 'Senegal',
4310 'RS': 'Serbia',
4311 'SC': 'Seychelles',
4312 'SL': 'Sierra Leone',
4313 'SG': 'Singapore',
4314 'SX': 'Sint Maarten (Dutch part)',
4315 'SK': 'Slovakia',
4316 'SI': 'Slovenia',
4317 'SB': 'Solomon Islands',
4318 'SO': 'Somalia',
4319 'ZA': 'South Africa',
4320 'GS': 'South Georgia and the South Sandwich Islands',
4321 'SS': 'South Sudan',
4322 'ES': 'Spain',
4323 'LK': 'Sri Lanka',
4324 'SD': 'Sudan',
4325 'SR': 'Suriname',
4326 'SJ': 'Svalbard and Jan Mayen',
4327 'SZ': 'Swaziland',
4328 'SE': 'Sweden',
4329 'CH': 'Switzerland',
4330 'SY': 'Syrian Arab Republic',
4331 'TW': 'Taiwan, Province of China',
4332 'TJ': 'Tajikistan',
4333 'TZ': 'Tanzania, United Republic of',
4334 'TH': 'Thailand',
4335 'TL': 'Timor-Leste',
4336 'TG': 'Togo',
4337 'TK': 'Tokelau',
4338 'TO': 'Tonga',
4339 'TT': 'Trinidad and Tobago',
4340 'TN': 'Tunisia',
4341 'TR': 'Turkey',
4342 'TM': 'Turkmenistan',
4343 'TC': 'Turks and Caicos Islands',
4344 'TV': 'Tuvalu',
4345 'UG': 'Uganda',
4346 'UA': 'Ukraine',
4347 'AE': 'United Arab Emirates',
4348 'GB': 'United Kingdom',
4349 'US': 'United States',
4350 'UM': 'United States Minor Outlying Islands',
4351 'UY': 'Uruguay',
4352 'UZ': 'Uzbekistan',
4353 'VU': 'Vanuatu',
4354 'VE': 'Venezuela, Bolivarian Republic of',
4355 'VN': 'Viet Nam',
4356 'VG': 'Virgin Islands, British',
4357 'VI': 'Virgin Islands, U.S.',
4358 'WF': 'Wallis and Futuna',
4359 'EH': 'Western Sahara',
4360 'YE': 'Yemen',
4361 'ZM': 'Zambia',
4362 'ZW': 'Zimbabwe',
4363 # Not ISO 3166 codes, but used for IP blocks
4364 'AP': 'Asia/Pacific Region',
4365 'EU': 'Europe',
4366 }
4367
4368 @classmethod
4369 def short2full(cls, code):
4370 """Convert an ISO 3166-2 country code to the corresponding full name"""
4371 return cls._country_map.get(code.upper())
4372
4373
4374 class GeoUtils:
4375 # Major IPv4 address blocks per country
4376 _country_ip_map = {
4377 'AD': '46.172.224.0/19',
4378 'AE': '94.200.0.0/13',
4379 'AF': '149.54.0.0/17',
4380 'AG': '209.59.64.0/18',
4381 'AI': '204.14.248.0/21',
4382 'AL': '46.99.0.0/16',
4383 'AM': '46.70.0.0/15',
4384 'AO': '105.168.0.0/13',
4385 'AP': '182.50.184.0/21',
4386 'AQ': '23.154.160.0/24',
4387 'AR': '181.0.0.0/12',
4388 'AS': '202.70.112.0/20',
4389 'AT': '77.116.0.0/14',
4390 'AU': '1.128.0.0/11',
4391 'AW': '181.41.0.0/18',
4392 'AX': '185.217.4.0/22',
4393 'AZ': '5.197.0.0/16',
4394 'BA': '31.176.128.0/17',
4395 'BB': '65.48.128.0/17',
4396 'BD': '114.130.0.0/16',
4397 'BE': '57.0.0.0/8',
4398 'BF': '102.178.0.0/15',
4399 'BG': '95.42.0.0/15',
4400 'BH': '37.131.0.0/17',
4401 'BI': '154.117.192.0/18',
4402 'BJ': '137.255.0.0/16',
4403 'BL': '185.212.72.0/23',
4404 'BM': '196.12.64.0/18',
4405 'BN': '156.31.0.0/16',
4406 'BO': '161.56.0.0/16',
4407 'BQ': '161.0.80.0/20',
4408 'BR': '191.128.0.0/12',
4409 'BS': '24.51.64.0/18',
4410 'BT': '119.2.96.0/19',
4411 'BW': '168.167.0.0/16',
4412 'BY': '178.120.0.0/13',
4413 'BZ': '179.42.192.0/18',
4414 'CA': '99.224.0.0/11',
4415 'CD': '41.243.0.0/16',
4416 'CF': '197.242.176.0/21',
4417 'CG': '160.113.0.0/16',
4418 'CH': '85.0.0.0/13',
4419 'CI': '102.136.0.0/14',
4420 'CK': '202.65.32.0/19',
4421 'CL': '152.172.0.0/14',
4422 'CM': '102.244.0.0/14',
4423 'CN': '36.128.0.0/10',
4424 'CO': '181.240.0.0/12',
4425 'CR': '201.192.0.0/12',
4426 'CU': '152.206.0.0/15',
4427 'CV': '165.90.96.0/19',
4428 'CW': '190.88.128.0/17',
4429 'CY': '31.153.0.0/16',
4430 'CZ': '88.100.0.0/14',
4431 'DE': '53.0.0.0/8',
4432 'DJ': '197.241.0.0/17',
4433 'DK': '87.48.0.0/12',
4434 'DM': '192.243.48.0/20',
4435 'DO': '152.166.0.0/15',
4436 'DZ': '41.96.0.0/12',
4437 'EC': '186.68.0.0/15',
4438 'EE': '90.190.0.0/15',
4439 'EG': '156.160.0.0/11',
4440 'ER': '196.200.96.0/20',
4441 'ES': '88.0.0.0/11',
4442 'ET': '196.188.0.0/14',
4443 'EU': '2.16.0.0/13',
4444 'FI': '91.152.0.0/13',
4445 'FJ': '144.120.0.0/16',
4446 'FK': '80.73.208.0/21',
4447 'FM': '119.252.112.0/20',
4448 'FO': '88.85.32.0/19',
4449 'FR': '90.0.0.0/9',
4450 'GA': '41.158.0.0/15',
4451 'GB': '25.0.0.0/8',
4452 'GD': '74.122.88.0/21',
4453 'GE': '31.146.0.0/16',
4454 'GF': '161.22.64.0/18',
4455 'GG': '62.68.160.0/19',
4456 'GH': '154.160.0.0/12',
4457 'GI': '95.164.0.0/16',
4458 'GL': '88.83.0.0/19',
4459 'GM': '160.182.0.0/15',
4460 'GN': '197.149.192.0/18',
4461 'GP': '104.250.0.0/19',
4462 'GQ': '105.235.224.0/20',
4463 'GR': '94.64.0.0/13',
4464 'GT': '168.234.0.0/16',
4465 'GU': '168.123.0.0/16',
4466 'GW': '197.214.80.0/20',
4467 'GY': '181.41.64.0/18',
4468 'HK': '113.252.0.0/14',
4469 'HN': '181.210.0.0/16',
4470 'HR': '93.136.0.0/13',
4471 'HT': '148.102.128.0/17',
4472 'HU': '84.0.0.0/14',
4473 'ID': '39.192.0.0/10',
4474 'IE': '87.32.0.0/12',
4475 'IL': '79.176.0.0/13',
4476 'IM': '5.62.80.0/20',
4477 'IN': '117.192.0.0/10',
4478 'IO': '203.83.48.0/21',
4479 'IQ': '37.236.0.0/14',
4480 'IR': '2.176.0.0/12',
4481 'IS': '82.221.0.0/16',
4482 'IT': '79.0.0.0/10',
4483 'JE': '87.244.64.0/18',
4484 'JM': '72.27.0.0/17',
4485 'JO': '176.29.0.0/16',
4486 'JP': '133.0.0.0/8',
4487 'KE': '105.48.0.0/12',
4488 'KG': '158.181.128.0/17',
4489 'KH': '36.37.128.0/17',
4490 'KI': '103.25.140.0/22',
4491 'KM': '197.255.224.0/20',
4492 'KN': '198.167.192.0/19',
4493 'KP': '175.45.176.0/22',
4494 'KR': '175.192.0.0/10',
4495 'KW': '37.36.0.0/14',
4496 'KY': '64.96.0.0/15',
4497 'KZ': '2.72.0.0/13',
4498 'LA': '115.84.64.0/18',
4499 'LB': '178.135.0.0/16',
4500 'LC': '24.92.144.0/20',
4501 'LI': '82.117.0.0/19',
4502 'LK': '112.134.0.0/15',
4503 'LR': '102.183.0.0/16',
4504 'LS': '129.232.0.0/17',
4505 'LT': '78.56.0.0/13',
4506 'LU': '188.42.0.0/16',
4507 'LV': '46.109.0.0/16',
4508 'LY': '41.252.0.0/14',
4509 'MA': '105.128.0.0/11',
4510 'MC': '88.209.64.0/18',
4511 'MD': '37.246.0.0/16',
4512 'ME': '178.175.0.0/17',
4513 'MF': '74.112.232.0/21',
4514 'MG': '154.126.0.0/17',
4515 'MH': '117.103.88.0/21',
4516 'MK': '77.28.0.0/15',
4517 'ML': '154.118.128.0/18',
4518 'MM': '37.111.0.0/17',
4519 'MN': '49.0.128.0/17',
4520 'MO': '60.246.0.0/16',
4521 'MP': '202.88.64.0/20',
4522 'MQ': '109.203.224.0/19',
4523 'MR': '41.188.64.0/18',
4524 'MS': '208.90.112.0/22',
4525 'MT': '46.11.0.0/16',
4526 'MU': '105.16.0.0/12',
4527 'MV': '27.114.128.0/18',
4528 'MW': '102.70.0.0/15',
4529 'MX': '187.192.0.0/11',
4530 'MY': '175.136.0.0/13',
4531 'MZ': '197.218.0.0/15',
4532 'NA': '41.182.0.0/16',
4533 'NC': '101.101.0.0/18',
4534 'NE': '197.214.0.0/18',
4535 'NF': '203.17.240.0/22',
4536 'NG': '105.112.0.0/12',
4537 'NI': '186.76.0.0/15',
4538 'NL': '145.96.0.0/11',
4539 'NO': '84.208.0.0/13',
4540 'NP': '36.252.0.0/15',
4541 'NR': '203.98.224.0/19',
4542 'NU': '49.156.48.0/22',
4543 'NZ': '49.224.0.0/14',
4544 'OM': '5.36.0.0/15',
4545 'PA': '186.72.0.0/15',
4546 'PE': '186.160.0.0/14',
4547 'PF': '123.50.64.0/18',
4548 'PG': '124.240.192.0/19',
4549 'PH': '49.144.0.0/13',
4550 'PK': '39.32.0.0/11',
4551 'PL': '83.0.0.0/11',
4552 'PM': '70.36.0.0/20',
4553 'PR': '66.50.0.0/16',
4554 'PS': '188.161.0.0/16',
4555 'PT': '85.240.0.0/13',
4556 'PW': '202.124.224.0/20',
4557 'PY': '181.120.0.0/14',
4558 'QA': '37.210.0.0/15',
4559 'RE': '102.35.0.0/16',
4560 'RO': '79.112.0.0/13',
4561 'RS': '93.86.0.0/15',
4562 'RU': '5.136.0.0/13',
4563 'RW': '41.186.0.0/16',
4564 'SA': '188.48.0.0/13',
4565 'SB': '202.1.160.0/19',
4566 'SC': '154.192.0.0/11',
4567 'SD': '102.120.0.0/13',
4568 'SE': '78.64.0.0/12',
4569 'SG': '8.128.0.0/10',
4570 'SI': '188.196.0.0/14',
4571 'SK': '78.98.0.0/15',
4572 'SL': '102.143.0.0/17',
4573 'SM': '89.186.32.0/19',
4574 'SN': '41.82.0.0/15',
4575 'SO': '154.115.192.0/18',
4576 'SR': '186.179.128.0/17',
4577 'SS': '105.235.208.0/21',
4578 'ST': '197.159.160.0/19',
4579 'SV': '168.243.0.0/16',
4580 'SX': '190.102.0.0/20',
4581 'SY': '5.0.0.0/16',
4582 'SZ': '41.84.224.0/19',
4583 'TC': '65.255.48.0/20',
4584 'TD': '154.68.128.0/19',
4585 'TG': '196.168.0.0/14',
4586 'TH': '171.96.0.0/13',
4587 'TJ': '85.9.128.0/18',
4588 'TK': '27.96.24.0/21',
4589 'TL': '180.189.160.0/20',
4590 'TM': '95.85.96.0/19',
4591 'TN': '197.0.0.0/11',
4592 'TO': '175.176.144.0/21',
4593 'TR': '78.160.0.0/11',
4594 'TT': '186.44.0.0/15',
4595 'TV': '202.2.96.0/19',
4596 'TW': '120.96.0.0/11',
4597 'TZ': '156.156.0.0/14',
4598 'UA': '37.52.0.0/14',
4599 'UG': '102.80.0.0/13',
4600 'US': '6.0.0.0/8',
4601 'UY': '167.56.0.0/13',
4602 'UZ': '84.54.64.0/18',
4603 'VA': '212.77.0.0/19',
4604 'VC': '207.191.240.0/21',
4605 'VE': '186.88.0.0/13',
4606 'VG': '66.81.192.0/20',
4607 'VI': '146.226.0.0/16',
4608 'VN': '14.160.0.0/11',
4609 'VU': '202.80.32.0/20',
4610 'WF': '117.20.32.0/21',
4611 'WS': '202.4.32.0/19',
4612 'YE': '134.35.0.0/16',
4613 'YT': '41.242.116.0/22',
4614 'ZA': '41.0.0.0/11',
4615 'ZM': '102.144.0.0/13',
4616 'ZW': '102.177.192.0/18',
4617 }
4618
4619 @classmethod
4620 def random_ipv4(cls, code_or_block):
4621 if len(code_or_block) == 2:
4622 block = cls._country_ip_map.get(code_or_block.upper())
4623 if not block:
4624 return None
4625 else:
4626 block = code_or_block
4627 addr, preflen = block.split('/')
4628 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4629 addr_max = addr_min | (0xffffffff >> int(preflen))
4630 return str(socket.inet_ntoa(
4631 struct.pack('!L', random.randint(addr_min, addr_max))))
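
# Illustrative example (editor's sketch): 'DE' maps to 53.0.0.0/8, so addr_min
# is 53.0.0.0 and addr_max is 53.255.255.255; the result is non-deterministic:
# >>> GeoUtils.random_ipv4('DE')  # e.g. '53.17.201.4'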
4632
4633
4634 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4635 def __init__(self, proxies=None):
4636 # Set default handlers
4637 for type in ('http', 'https'):
4638 setattr(self, '%s_open' % type,
4639 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4640 meth(r, proxy, type))
4641 urllib.request.ProxyHandler.__init__(self, proxies)
4642
4643 def proxy_open(self, req, proxy, type):
4644 req_proxy = req.headers.get('Ytdl-request-proxy')
4645 if req_proxy is not None:
4646 proxy = req_proxy
4647 del req.headers['Ytdl-request-proxy']
4648
4649 if proxy == '__noproxy__':
4650 return None # No Proxy
4651 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4652 req.add_header('Ytdl-socks-proxy', proxy)
4653 # yt-dlp's http/https handlers wrap the socket with socks themselves
4654 return None
4655 return urllib.request.ProxyHandler.proxy_open(
4656 self, req, proxy, type)
4657
4658
4659 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4660 # released into Public Domain
4661 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4662
4663 def long_to_bytes(n, blocksize=0):
4664 """long_to_bytes(n:long, blocksize:int) : string
4665 Convert a long integer to a byte string.
4666
4667 If optional blocksize is given and greater than zero, pad the front of the
4668 byte string with binary zeros so that the length is a multiple of
4669 blocksize.
4670 """
4671 # after much testing, this algorithm was deemed to be the fastest
4672 s = b''
4673 n = int(n)
4674 while n > 0:
4675 s = struct.pack('>I', n & 0xffffffff) + s
4676 n = n >> 32
4677 # strip off leading zeros
4678 for i in range(len(s)):
4679 if s[i] != b'\000'[0]:
4680 break
4681 else:
4682 # only happens when n == 0
4683 s = b'\000'
4684 i = 0
4685 s = s[i:]
4686 # add back some pad bytes. this could be done more efficiently w.r.t. the
4687 # de-padding being done above, but sigh...
4688 if blocksize > 0 and len(s) % blocksize:
4689 s = (blocksize - len(s) % blocksize) * b'\000' + s
4690 return s
4691
4692
4693 def bytes_to_long(s):
4694 """bytes_to_long(string) : long
4695 Convert a byte string to a long integer.
4696
4697 This is (essentially) the inverse of long_to_bytes().
4698 """
4699 acc = 0
4700 length = len(s)
4701 if length % 4:
4702 extra = (4 - length % 4)
4703 s = b'\000' * extra + s
4704 length = length + extra
4705 for i in range(0, length, 4):
4706 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4707 return acc
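
# Illustrative round-trip (editor's sketch):
# >>> long_to_bytes(0x12345678, blocksize=8)
# b'\x00\x00\x00\x00\x124Vx'
# >>> bytes_to_long(b'\x124Vx')
# 305419896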
4708
4709
4710 def ohdave_rsa_encrypt(data, exponent, modulus):
4711 '''
4712 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4713
4714 Input:
4715 data: data to encrypt, bytes-like object
4716 exponent, modulus: parameter e and N of RSA algorithm, both integer
4717 Output: hex string of encrypted data
4718
4719 Limitation: supports one block encryption only
4720 '''
4721
4722 payload = int(binascii.hexlify(data[::-1]), 16)
4723 encrypted = pow(payload, exponent, modulus)
4724 return '%x' % encrypted
4725
4726
4727 def pkcs1pad(data, length):
4728 """
4729 Padding input data with PKCS#1 scheme
4730
4731 @param {int[]} data input data
4732 @param {int} length target length
4733 @returns {int[]} padded data
4734 """
4735 if len(data) > length - 11:
4736 raise ValueError('Input data too long for PKCS#1 padding')
4737
4738 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4739 return [0, 2] + pseudo_random + [0] + data
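
# Illustrative layout (editor's sketch): pkcs1pad([0x41], 12) returns
# [0, 2, r1, ..., r8, 0, 0x41] where r1..r8 are random bytes in [0, 254].
# (Strict PKCS#1 v1.5 requires the padding bytes to be non-zero, which
# random.randint(0, 254) above does not guarantee.)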
4740
4741
4742 def _base_n_table(n, table):
4743 if not table and not n:
4744 raise ValueError('Either table or n must be specified')
4745 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4746
4747 if n and n != len(table):
4748 raise ValueError(f'base {n} exceeds table length {len(table)}')
4749 return table
4750
4751
4752 def encode_base_n(num, n=None, table=None):
4753 """Convert given int to a base-n string"""
4754 table = _base_n_table(n, table)
4755 if not num:
4756 return table[0]
4757
4758 result, base = '', len(table)
4759 while num:
4760 result = table[num % base] + result
4761 num = num // base
4762 return result
4763
4764
4765 def decode_base_n(string, n=None, table=None):
4766 """Convert given base-n string to int"""
4767 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4768 result, base = 0, len(table)
4769 for char in string:
4770 result = result * base + table[char]
4771 return result
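
# Illustrative round-trip (editor's sketch):
# >>> encode_base_n(123456, n=36)
# '2n9c'
# >>> decode_base_n('2n9c', n=36)
# 123456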
4772
4773
4774 def decode_base(value, digits):
4775 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4776 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
4777 return decode_base_n(value, table=digits)
4778
4779
4780 def decode_packed_codes(code):
4781 mobj = re.search(PACKED_CODES_RE, code)
4782 obfuscated_code, base, count, symbols = mobj.groups()
4783 base = int(base)
4784 count = int(count)
4785 symbols = symbols.split('|')
4786 symbol_table = {}
4787
4788 while count:
4789 count -= 1
4790 base_n_count = encode_base_n(count, base)
4791 symbol_table[base_n_count] = symbols[count] or base_n_count
4792
4793 return re.sub(
4794 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4795 obfuscated_code)
4796
4797
4798 def caesar(s, alphabet, shift):
4799 if shift == 0:
4800 return s
4801 l = len(alphabet)
4802 return ''.join(
4803 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4804 for c in s)
4805
4806
4807 def rot47(s):
4808 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
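
# Illustrative examples (editor's sketch):
# >>> caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 2)
# 'cde'
# >>> rot47('foo')
# '7@@'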
4809
4810
4811 def parse_m3u8_attributes(attrib):
4812 info = {}
4813 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4814 if val.startswith('"'):
4815 val = val[1:-1]
4816 info[key] = val
4817 return info
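
# Illustrative example (editor's sketch): quoted values may contain commas
# >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
# {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}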
4818
4819
4820 def urshift(val, n):
4821 return val >> n if val >= 0 else (val + 0x100000000) >> n
4822
4823
4824 # Based on png2str() written by @gdkchan and improved by @yokrysty
4825 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4826 def decode_png(png_data):
4827 # Reference: https://www.w3.org/TR/PNG/
4828 header = png_data[8:]
4829
4830 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4831 raise OSError('Not a valid PNG file.')
4832
4833 int_map = {1: '>B', 2: '>H', 4: '>I'}
4834 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4835
4836 chunks = []
4837
4838 while header:
4839 length = unpack_integer(header[:4])
4840 header = header[4:]
4841
4842 chunk_type = header[:4]
4843 header = header[4:]
4844
4845 chunk_data = header[:length]
4846 header = header[length:]
4847
4848 header = header[4:] # Skip CRC
4849
4850 chunks.append({
4851 'type': chunk_type,
4852 'length': length,
4853 'data': chunk_data
4854 })
4855
4856 ihdr = chunks[0]['data']
4857
4858 width = unpack_integer(ihdr[:4])
4859 height = unpack_integer(ihdr[4:8])
4860
4861 idat = b''
4862
4863 for chunk in chunks:
4864 if chunk['type'] == b'IDAT':
4865 idat += chunk['data']
4866
4867 if not idat:
4868 raise OSError('Unable to read PNG data.')
4869
4870 decompressed_data = bytearray(zlib.decompress(idat))
4871
4872 stride = width * 3
4873 pixels = []
4874
4875 def _get_pixel(idx):
4876 x = idx % stride
4877 y = idx // stride
4878 return pixels[y][x]
4879
4880 for y in range(height):
4881 basePos = y * (1 + stride)
4882 filter_type = decompressed_data[basePos]
4883
4884 current_row = []
4885
4886 pixels.append(current_row)
4887
4888 for x in range(stride):
4889 color = decompressed_data[1 + basePos + x]
4890 basex = y * stride + x
4891 left = 0
4892 up = 0
4893
4894 if x > 2:
4895 left = _get_pixel(basex - 3)
4896 if y > 0:
4897 up = _get_pixel(basex - stride)
4898
4899 if filter_type == 1: # Sub
4900 color = (color + left) & 0xff
4901 elif filter_type == 2: # Up
4902 color = (color + up) & 0xff
4903 elif filter_type == 3: # Average
4904 color = (color + ((left + up) >> 1)) & 0xff
4905 elif filter_type == 4: # Paeth
4906 a = left
4907 b = up
4908 c = 0
4909
4910 if x > 2 and y > 0:
4911 c = _get_pixel(basex - stride - 3)
4912
4913 p = a + b - c
4914
4915 pa = abs(p - a)
4916 pb = abs(p - b)
4917 pc = abs(p - c)
4918
4919 if pa <= pb and pa <= pc:
4920 color = (color + a) & 0xff
4921 elif pb <= pc:
4922 color = (color + b) & 0xff
4923 else:
4924 color = (color + c) & 0xff
4925
4926 current_row.append(color)
4927
4928 return width, height, pixels
4929
4930
4931 def write_xattr(path, key, value):
4932 # Windows: Write xattrs to NTFS Alternate Data Streams:
4933 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4934 if compat_os_name == 'nt':
4935 assert ':' not in key
4936 assert os.path.exists(path)
4937
4938 try:
4939 with open(f'{path}:{key}', 'wb') as f:
4940 f.write(value)
4941 except OSError as e:
4942 raise XAttrMetadataError(e.errno, e.strerror)
4943 return
4944
4945 # UNIX Method 1. Use xattrs/pyxattrs modules
4946
4947 setxattr = None
4948 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4949 # Unicode arguments are not supported in pyxattr until version 0.5.0
4950 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4951 if version_tuple(xattr.__version__) >= (0, 5, 0):
4952 setxattr = xattr.set
4953 elif xattr:
4954 setxattr = xattr.setxattr
4955
4956 if setxattr:
4957 try:
4958 setxattr(path, key, value)
4959 except OSError as e:
4960 raise XAttrMetadataError(e.errno, e.strerror)
4961 return
4962
4963 # UNIX Method 2. Use setfattr/xattr executables
4964 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4965 else 'xattr' if check_executable('xattr', ['-h']) else None)
4966 if not exe:
4967 raise XAttrUnavailableError(
4968 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4969 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4970
4971 value = value.decode()
4972 try:
4973 _, stderr, returncode = Popen.run(
4974 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4975 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4976 except OSError as e:
4977 raise XAttrMetadataError(e.errno, e.strerror)
4978 if returncode:
4979 raise XAttrMetadataError(returncode, stderr)
4980
4981
4982 def random_birthday(year_field, month_field, day_field):
4983 start_date = datetime.date(1950, 1, 1)
4984 end_date = datetime.date(1995, 12, 31)
4985 offset = random.randint(0, (end_date - start_date).days)
4986 random_date = start_date + datetime.timedelta(offset)
4987 return {
4988 year_field: str(random_date.year),
4989 month_field: str(random_date.month),
4990 day_field: str(random_date.day),
4991 }
4992
4993
4994 # Templates for internet shortcut files, which are plain text files.
4995 DOT_URL_LINK_TEMPLATE = '''\
4996 [InternetShortcut]
4997 URL=%(url)s
4998 '''
4999
5000 DOT_WEBLOC_LINK_TEMPLATE = '''\
5001 <?xml version="1.0" encoding="UTF-8"?>
5002 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5003 <plist version="1.0">
5004 <dict>
5005 \t<key>URL</key>
5006 \t<string>%(url)s</string>
5007 </dict>
5008 </plist>
5009 '''
5010
5011 DOT_DESKTOP_LINK_TEMPLATE = '''\
5012 [Desktop Entry]
5013 Encoding=UTF-8
5014 Name=%(filename)s
5015 Type=Link
5016 URL=%(url)s
5017 Icon=text-html
5018 '''
5019
5020 LINK_TEMPLATES = {
5021 'url': DOT_URL_LINK_TEMPLATE,
5022 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5023 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5024 }
5025
5026
5027 def iri_to_uri(iri):
5028 """
5029 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5030
5031 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes the remaining unsafe characters using their underlying UTF-8 encoding, leaving already-escaped sequences intact.
5032 """
5033
5034 iri_parts = urllib.parse.urlparse(iri)
5035
5036 if '[' in iri_parts.netloc:
5037 raise ValueError('IPv6 URIs are not yet supported.')
5038 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5039
5040 # The `safe` argument values used by the following code contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5041
5042 net_location = ''
5043 if iri_parts.username:
5044 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5045 if iri_parts.password is not None:
5046 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5047 net_location += '@'
5048
5049 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5050 # The 'idna' encoding produces ASCII text.
5051 if iri_parts.port is not None and iri_parts.port != 80:
5052 net_location += ':' + str(iri_parts.port)
5053
5054 return urllib.parse.urlunparse(
5055 (iri_parts.scheme,
5056 net_location,
5057
5058 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5059
5060 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5061 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5062
5063 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5064 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5065
5066 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5067
5068 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
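
# Illustrative example (editor's sketch):
# >>> iri_to_uri('http://example.com/résumé?q=café')
# 'http://example.com/r%C3%A9sum%C3%A9?q=caf%C3%A9'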
5069
5070
5071 def to_high_limit_path(path):
5072 if sys.platform in ['win32', 'cygwin']:
5073 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5074 return '\\\\?\\' + os.path.abspath(path)
5075
5076 return path
5077
5078
5079 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5080 val = traverse_obj(obj, *variadic(field))
5081 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5082 return default
5083 return template % func(val)
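
# Illustrative examples (editor's sketch):
# >>> format_field({'height': 1080}, 'height', '%sp')
# '1080p'
# >>> format_field({'height': None}, 'height', '%sp', default='unknown')
# 'unknown'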
5084
5085
5086 def clean_podcast_url(url):
5087 return re.sub(r'''(?x)
5088 (?:
5089 (?:
5090 chtbl\.com/track|
5091 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5092 play\.podtrac\.com
5093 )/[^/]+|
5094 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5095 flex\.acast\.com|
5096 pd(?:
5097 cn\.co| # https://podcorn.com/analytics-prefix/
5098 st\.fm # https://podsights.com/docs/
5099 )/e
5100 )/''', '', url)
5101
5102
5103 _HEX_TABLE = '0123456789abcdef'
5104
5105
5106 def random_uuidv4():
5107 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5108
5109
5110 def make_dir(path, to_screen=None):
5111 try:
5112 dn = os.path.dirname(path)
5113 if dn and not os.path.exists(dn):
5114 os.makedirs(dn)
5115 return True
5116 except OSError as err:
5117 if callable(to_screen):
5118 to_screen('unable to create directory ' + error_to_compat_str(err))
5119 return False
5120
5121
5122 def get_executable_path():
5123 from .update import _get_variant_and_executable_path
5124
5125 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5126
5127
5128 def load_plugins(name, suffix, namespace):
5129 classes = {}
5130 with contextlib.suppress(FileNotFoundError):
5131 plugins_spec = importlib.util.spec_from_file_location(
5132 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5133 plugins = importlib.util.module_from_spec(plugins_spec)
5134 sys.modules[plugins_spec.name] = plugins
5135 plugins_spec.loader.exec_module(plugins)
5136 for name in dir(plugins):
5137 if name in namespace:
5138 continue
5139 if not name.endswith(suffix):
5140 continue
5141 klass = getattr(plugins, name)
5142 classes[name] = namespace[name] = klass
5143 return classes
5144
5145
5146 def traverse_obj(
5147 obj, *path_list, default=None, expected_type=None, get_all=True,
5148 casesense=True, is_user_input=False, traverse_string=False):
5149 ''' Traverse nested list/dict/tuple
5150 @param path_list A list of paths which are checked one by one.
5151 Each path is a list of keys where each key is a:
5152 - None: Do nothing
5153 - string: A dictionary key
5154 - int: An index into a list
5155 - tuple: A list of keys all of which will be traversed
5156 - Ellipsis: Fetch all values in the object
5157 - Function: Takes the key and value as arguments
5158 and returns whether the key matches or not
5159 @param default Default value to return
5160 @param expected_type Only accept final value of this type (Can also be any callable)
5161 @param get_all Return all the values obtained from a path or only the first one
5162 @param casesense Whether to consider dictionary keys as case sensitive
5163 @param is_user_input Whether the keys are generated from user input. If True,
5164 strings are converted to int/slice if necessary
5165 @param traverse_string Whether to traverse inside strings. If True, any
5166 non-compatible object will also be converted into a string
5167 # TODO: Write tests
5168 '''
5169 if not casesense:
5170 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5171 path_list = (map(_lower, variadic(path)) for path in path_list)
5172
5173 def _traverse_obj(obj, path, _current_depth=0):
5174 nonlocal depth
5175 path = tuple(variadic(path))
5176 for i, key in enumerate(path):
5177 if None in (key, obj):
5178 return obj
5179 if isinstance(key, (list, tuple)):
5180 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5181 key = ...
5182 if key is ...:
5183 obj = (obj.values() if isinstance(obj, dict)
5184 else obj if isinstance(obj, (list, tuple, LazyList))
5185 else str(obj) if traverse_string else [])
5186 _current_depth += 1
5187 depth = max(depth, _current_depth)
5188 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5189 elif callable(key):
5190 if isinstance(obj, (list, tuple, LazyList)):
5191 obj = enumerate(obj)
5192 elif isinstance(obj, dict):
5193 obj = obj.items()
5194 else:
5195 if not traverse_string:
5196 return None
5197 obj = str(obj)
5198 _current_depth += 1
5199 depth = max(depth, _current_depth)
5200 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5201 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5202 obj = (obj.get(key) if casesense or (key in obj)
5203 else next((v for k, v in obj.items() if _lower(k) == key), None))
5204 else:
5205 if is_user_input:
5206 key = (int_or_none(key) if ':' not in key
5207 else slice(*map(int_or_none, key.split(':'))))
5208 if key == slice(None):
5209 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5210 if not isinstance(key, (int, slice)):
5211 return None
5212 if not isinstance(obj, (list, tuple, LazyList)):
5213 if not traverse_string:
5214 return None
5215 obj = str(obj)
5216 try:
5217 obj = obj[key]
5218 except IndexError:
5219 return None
5220 return obj
5221
5222 if isinstance(expected_type, type):
5223 type_test = lambda val: val if isinstance(val, expected_type) else None
5224 else:
5225 type_test = expected_type or IDENTITY
5226
5227 for path in path_list:
5228 depth = 0
5229 val = _traverse_obj(obj, path)
5230 if val is not None:
5231 if depth:
5232 for _ in range(depth - 1):
5233 val = itertools.chain.from_iterable(v for v in val if v is not None)
5234 val = [v for v in map(type_test, val) if v is not None]
5235 if val:
5236 return val if get_all else val[0]
5237 else:
5238 val = type_test(val)
5239 if val is not None:
5240 return val
5241 return default
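
# Illustrative examples (editor's sketch): `...` fetches all values at a level
# >>> traverse_obj({'formats': [{'url': 'a'}, {'url': 'b'}]}, ('formats', ..., 'url'))
# ['a', 'b']
# >>> traverse_obj({'formats': [{'url': 'a'}]}, ('formats', 0, 'height'), default=0)
# 0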
5242
5243
5244 def traverse_dict(dictn, keys, casesense=True):
5245 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5246 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5247 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5248
5249
5250 def get_first(obj, keys, **kwargs):
5251 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5252
5253
5254 def variadic(x, allowed_types=(str, bytes, dict)):
5255 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5256
5257
5258 def time_seconds(**kwargs):
5259 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5260 return t.timestamp()
5261
5262
5263 # create a JSON Web Signature (jws) with HS256 algorithm
5264 # the resulting format is in JWS Compact Serialization
5265 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5266 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5267 def jwt_encode_hs256(payload_data, key, headers={}):
5268 header_data = {
5269 'alg': 'HS256',
5270 'typ': 'JWT',
5271 }
5272 if headers:
5273 header_data.update(headers)
5274 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5275 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5276 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5277 signature_b64 = base64.b64encode(h.digest())
5278 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5279 return token
5280
5281
5282 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5283 def jwt_decode_hs256(jwt):
5284 header_b64, payload_b64, signature_b64 = jwt.split('.')
5285 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5286 return payload_data
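
# Illustrative round-trip (editor's sketch; jwt_encode_hs256 returns bytes):
# >>> token = jwt_encode_hs256({'uid': 123}, 'secret')
# >>> jwt_decode_hs256(token.decode())['uid']
# 123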
5287
5288
5289 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5290
5291
5292 @functools.cache
5293 def supports_terminal_sequences(stream):
5294 if compat_os_name == 'nt':
5295 if not WINDOWS_VT_MODE:
5296 return False
5297 elif not os.getenv('TERM'):
5298 return False
5299 try:
5300 return stream.isatty()
5301 except BaseException:
5302 return False
5303
5304
5305 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5306 if get_windows_version() < (10, 0, 10586):
5307 return
5308 global WINDOWS_VT_MODE
5309 try:
5310 Popen.run('', shell=True)
5311 except Exception:
5312 return
5313
5314 WINDOWS_VT_MODE = True
5315 supports_terminal_sequences.cache_clear()
5316
5317
5318 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5319
5320
5321 def remove_terminal_sequences(string):
5322 return _terminal_sequences_re.sub('', string)
5323
5324
5325 def number_of_digits(number):
5326 return len('%d' % number)
5327
5328
5329 def join_nonempty(*values, delim='-', from_dict=None):
5330 if from_dict is not None:
5331 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5332 return delim.join(map(str, filter(None, values)))
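
# Illustrative example (editor's sketch): falsy values are dropped
# >>> join_nonempty('mp4', None, 1080, '', delim='-')
# 'mp4-1080'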
5333
5334
5335 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5336 """
5337 Find the largest format dimensions in terms of video width and, for each thumbnail:
5338 * Modify the URL: Match the width with the provided regex and replace with the former width
5339 * Update dimensions
5340
5341 This function is useful with video services that scale the provided thumbnails on demand
5342 """
5343 _keys = ('width', 'height')
5344 max_dimensions = max(
5345 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5346 default=(0, 0))
5347 if not max_dimensions[0]:
5348 return thumbnails
5349 return [
5350 merge_dicts(
5351 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5352 dict(zip(_keys, max_dimensions)), thumbnail)
5353 for thumbnail in thumbnails
5354 ]
5355
5356
5357 def parse_http_range(range):
5358 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5359 if not range:
5360 return None, None, None
5361 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5362 if not crg:
5363 return None, None, None
5364 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
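
# Illustrative examples (editor's sketch):
# >>> parse_http_range('bytes 0-499/1234')
# (0, 499, 1234)
# >>> parse_http_range('bytes=500-')
# (500, None, None)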
5365
5366
5367 def read_stdin(what):
5368 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5369 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5370 return sys.stdin
5371
5372
5373 class Config:
5374 own_args = None
5375 parsed_args = None
5376 filename = None
5377 __initialized = False
5378
5379 def __init__(self, parser, label=None):
5380 self.parser, self.label = parser, label
5381 self._loaded_paths, self.configs = set(), []
5382
5383 def init(self, args=None, filename=None):
5384 assert not self.__initialized
5385 directory = ''
5386 if filename:
5387 location = os.path.realpath(filename)
5388 directory = os.path.dirname(location)
5389 if location in self._loaded_paths:
5390 return False
5391 self._loaded_paths.add(location)
5392
5393 self.own_args, self.__initialized = args, True
5394 opts, _ = self.parser.parse_known_args(args)
5395 self.parsed_args, self.filename = args, filename
5396
5397 for location in opts.config_locations or []:
5398 if location == '-':
5399 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5400 continue
5401 location = os.path.join(directory, expand_path(location))
5402 if os.path.isdir(location):
5403 location = os.path.join(location, 'yt-dlp.conf')
5404 if not os.path.exists(location):
5405 self.parser.error(f'config location {location} does not exist')
5406 self.append_config(self.read_file(location), location)
5407 return True
5408
5409 def __str__(self):
5410 label = join_nonempty(
5411 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5412 delim=' ')
5413 return join_nonempty(
5414 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5415 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5416 delim='\n')
5417
5418 @staticmethod
5419 def read_file(filename, default=[]):
5420 try:
5421 optionf = open(filename)
5422 except OSError:
5423 return default # silently skip if file is not present
5424 try:
5425 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5426 contents = optionf.read()
5427 res = shlex.split(contents, comments=True)
5428 except Exception as err:
5429 raise ValueError(f'Unable to parse "{filename}": {err}')
5430 finally:
5431 optionf.close()
5432 return res
5433
5434 @staticmethod
5435 def hide_login_info(opts):
5436 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5437 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5438
5439 def _scrub_eq(o):
5440 m = eqre.match(o)
5441 if m:
5442 return m.group('key') + '=PRIVATE'
5443 else:
5444 return o
5445
5446 opts = list(map(_scrub_eq, opts))
5447 for idx, opt in enumerate(opts):
5448 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5449 opts[idx + 1] = 'PRIVATE'
5450 return opts
5451
5452 def append_config(self, *args, label=None):
5453 config = type(self)(self.parser, label)
5454 config._loaded_paths = self._loaded_paths
5455 if config.init(*args):
5456 self.configs.append(config)
5457
5458 @property
5459 def all_args(self):
5460 for config in reversed(self.configs):
5461 yield from config.all_args
5462 yield from self.parsed_args or []
5463
5464 def parse_known_args(self, **kwargs):
5465 return self.parser.parse_known_args(self.all_args, **kwargs)
5466
5467 def parse_args(self):
5468 return self.parser.parse_args(self.all_args)
5469
5470
5471 class WebSocketsWrapper:
5472 """Wraps websockets module to use in non-async scopes"""
5473 pool = None
5474
5475 def __init__(self, url, headers=None, connect=True):
5476 self.loop = asyncio.new_event_loop()
5477 # XXX: "loop" is deprecated
5478 self.conn = websockets.connect(
5479 url, extra_headers=headers, ping_interval=None,
5480 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5481 if connect:
5482 self.__enter__()
5483 atexit.register(self.__exit__, None, None, None)
5484
5485 def __enter__(self):
5486 if not self.pool:
5487 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5488 return self
5489
5490 def send(self, *args):
5491 self.run_with_loop(self.pool.send(*args), self.loop)
5492
5493 def recv(self, *args):
5494 return self.run_with_loop(self.pool.recv(*args), self.loop)
5495
5496 def __exit__(self, type, value, traceback):
5497 try:
5498 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5499 finally:
5500 self._cancel_all_tasks(self.loop)  # cancel pending tasks before closing the loop
5501 self.loop.close()
5502
5503 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5504 # for contributors: If any other library that uses asyncio needs to be run in a non-async context, move these functions out of this class
5505 @staticmethod
5506 def run_with_loop(main, loop):
5507 if not asyncio.iscoroutine(main):
5508 raise ValueError(f'a coroutine was expected, got {main!r}')
5509
5510 try:
5511 return loop.run_until_complete(main)
5512 finally:
5513 loop.run_until_complete(loop.shutdown_asyncgens())
5514 if hasattr(loop, 'shutdown_default_executor'):
5515 loop.run_until_complete(loop.shutdown_default_executor())
5516
5517 @staticmethod
5518 def _cancel_all_tasks(loop):
5519 to_cancel = asyncio.all_tasks(loop)
5520
5521 if not to_cancel:
5522 return
5523
5524 for task in to_cancel:
5525 task.cancel()
5526
5527 # XXX: "loop" is removed in python 3.10+
5528 loop.run_until_complete(
5529 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5530
5531 for task in to_cancel:
5532 if task.cancelled():
5533 continue
5534 if task.exception() is not None:
5535 loop.call_exception_handler({
5536 'message': 'unhandled exception during asyncio.run() shutdown',
5537 'exception': task.exception(),
5538 'task': task,
5539 })
5540
5541
5542 def merge_headers(*dicts):
5543 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5544 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
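
# Illustrative example (editor's sketch): later dicts win, keys are title-cased
# >>> merge_headers({'user-agent': 'A'}, {'User-Agent': 'B', 'Accept': '*/*'})
# {'User-Agent': 'B', 'Accept': '*/*'}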
5545
5546
5547 class classproperty:
5548 """classmethod(property(func)) that works in py < 3.9"""
5549
5550 def __init__(self, func):
5551 functools.update_wrapper(self, func)
5552 self.func = func
5553
5554 def __get__(self, _, cls):
5555 return self.func(cls)
5556
5557
5558 class Namespace(types.SimpleNamespace):
5559 """Immutable namespace"""
5560
5561 def __iter__(self):
5562 return iter(self.__dict__.values())
5563
5564 @property
5565 def items_(self):
5566 return self.__dict__.items()
5567
5568
5569 # Deprecated
5570 has_certifi = bool(certifi)
5571 has_websockets = bool(websockets)