import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


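# Illustrative usage of write_json_file (example added for this writeup, not
# part of the original module): the temp-file-and-rename dance above means a
# crash mid-write cannot leave a truncated target file behind.
#     >>> write_json_file({'id': 'abc', 'title': 'Test'}, 'info.json')

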
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


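# Illustrative usage (not part of the original module):
#     >>> xpath_with_ns('media:group/media:title', {'media': 'http://search.yahoo.com/mrss/'})
#     '{http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}title'

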
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


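# Illustrative usage (not part of the original module); the class may appear
# anywhere in the element's space-separated class list:
#     >>> get_element_by_class('title', '<h1 class="main title">Hello</h1>')
#     'Hello'

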
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kwargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


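# Illustrative usage (not part of the original module); nested tags with the
# same name are matched correctly thanks to the parser-assisted scan above:
#     >>> get_element_text_and_html_by_tag('div', '<div>a<div>b</div>c</div>')
#     ('a<div>b</div>c', '<div>a<div>b</div>c</div>')

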
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


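# Illustrative usage (not part of the original module):
#     >>> clean_html('first line<br/>second &amp; last <b>line</b>  ')
#     'first line\nsecond & last line'

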
class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)


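# Illustrative usage (not part of the original module): `ignore_extra` tolerates
# trailing garbage after the JSON value (e.g. the tail of a JSONP wrapper),
# while `transform_source` can preprocess the string first:
#     >>> json.loads('{"a": 1});', cls=LenientJSONDecoder, ignore_extra=True)
#     {'a': 1}

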
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result


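# Illustrative usage (not part of the original module):
#     >>> sanitize_filename('New: Episode?')
#     'New - Episode'
#     >>> sanitize_filename('New: Episode?', restricted=True)
#     'New_-_Episode'

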
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url


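# Illustrative usage (not part of the original module):
#     >>> sanitize_url('//cdn.example.com/video.mp4')
#     'http://cdn.example.com/video.mp4'
#     >>> sanitize_url('httpss://example.com/')
#     'https://example.com/'

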
def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'


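# Illustrative usage (not part of the original module); credentials are moved
# out of the netloc and into a Basic auth header value:
#     >>> extract_basic_auth('http://user:pass@example.com/feed')
#     ('http://example.com/feed', 'Basic dXNlcjpwYXNz')

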
def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())


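# Illustrative usage (not part of the original module); unlike set(), this
# preserves first-seen order and accepts unhashable items:
#     >>> orderedSet([3, 1, 3, 2, 1])
#     [3, 1, 2]

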
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


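# Illustrative usage (not part of the original module); named, decimal and
# hexadecimal entities are all handled:
#     >>> unescapeHTML('&lt;b&gt; &amp; &#x61;')
#     '<b> & a'

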
def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode


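# Illustrative usage of Popen.run (example added for this writeup, not part of
# the original module); communicate_or_kill() guarantees the child is killed
# if e.g. KeyboardInterrupt arrives while waiting:
#     stdout, stderr, returncode = Popen.run(
#         ['ffmpeg', '-version'], text=True,
#         stdout=subprocess.PIPE, stderr=subprocess.PIPE)

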
def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret


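# Illustrative usage (not part of the original module):
#     >>> formatSeconds(3661)
#     '1:01:01'
#     >>> formatSeconds(61.5, msec=True)
#     '1:01.500'

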
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
               (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                    # preceded by 4 digits or hh:mm or
                  (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))      # not preceded by 3 alpha word or >= 4 alpha or 2 digits
               [ ]?                                              # optional space
               (?P<sign>\+|-)                                    # +/-
               (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})        # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


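# Illustrative usage (not part of the original module): the recognized suffix
# is stripped and returned as an offset:
#     >>> extract_timezone('2022-06-01T12:00:00+02:00')
#     (datetime.timedelta(seconds=7200), '2022-06-01T12:00:00')

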
1691 def parse_iso8601(date_str, delimiter='T', timezone=None):
1692 """ Return a UNIX timestamp from the given date """
1693
1694 if date_str is None:
1695 return None
1696
1697 date_str = re.sub(r'\.[0-9]+', '', date_str)
1698
1699 if timezone is None:
1700 timezone, date_str = extract_timezone(date_str)
1701
1702 with contextlib.suppress(ValueError):
1703 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1704 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1705 return calendar.timegm(dt.timetuple())
1706
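# Illustrative usage (all three forms below denote the same instant):
#   parse_iso8601('2014-03-23T22:04:26Z')       # -> 1395612266
#   parse_iso8601('2014-03-23T23:04:26+0100')   # -> 1395612266
#   parse_iso8601('2014-03-23T22:04:26.1234Z')  # -> 1395612266 (fractional seconds are discarded)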
1707
1708 def date_formats(day_first=True):
1709 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1710
1711
1712 def unified_strdate(date_str, day_first=True):
1713 """Return a string with the date in the format YYYYMMDD"""
1714
1715 if date_str is None:
1716 return None
1717 upload_date = None
1718 # Replace commas
1719 date_str = date_str.replace(',', ' ')
1720 # Remove AM/PM + timezone
1721 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1722 _, date_str = extract_timezone(date_str)
1723
1724 for expression in date_formats(day_first):
1725 with contextlib.suppress(ValueError):
1726 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1727 if upload_date is None:
1728 timetuple = email.utils.parsedate_tz(date_str)
1729 if timetuple:
1730 with contextlib.suppress(ValueError):
1731 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1732 if upload_date is not None:
1733 return str(upload_date)
1734
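# Illustrative usage:
#   unified_strdate('December 21, 2010')          # -> '20101221'
#   unified_strdate('8/7/2009')                   # -> '20090708' (day_first=True by default)
#   unified_strdate('8/7/2009', day_first=False)  # -> '20090807'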
1735
1736 def unified_timestamp(date_str, day_first=True):
1737 if date_str is None:
1738 return None
1739
1740 date_str = re.sub(r'[,|]', '', date_str)
1741
1742 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1743 timezone, date_str = extract_timezone(date_str)
1744
1745 # Remove AM/PM + timezone
1746 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1747
1748 # Remove unrecognized timezones from ISO 8601 alike timestamps
1749 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1750 if m:
1751 date_str = date_str[:-len(m.group('tz'))]
1752
1753 # Python only supports microseconds, so remove nanoseconds
1754 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1755 if m:
1756 date_str = m.group(1)
1757
1758 for expression in date_formats(day_first):
1759 with contextlib.suppress(ValueError):
1760 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1761 return calendar.timegm(dt.timetuple())
1762 timetuple = email.utils.parsedate_tz(date_str)
1763 if timetuple:
1764 return calendar.timegm(timetuple) + pm_delta * 3600
1765
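# Illustrative usage:
#   unified_timestamp('December 15, 2017 at 7:49 am')   # -> 1513324140
#   unified_timestamp('Dec 14, 2012', day_first=False)  # -> 1355443200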
1766
1767 def determine_ext(url, default_ext='unknown_video'):
1768 if url is None or '.' not in url:
1769 return default_ext
1770 guess = url.partition('?')[0].rpartition('.')[2]
1771 if re.match(r'^[A-Za-z0-9]+$', guess):
1772 return guess
1773 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1774 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1775 return guess.rstrip('/')
1776 else:
1777 return default_ext
1778
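# Illustrative usage:
#   determine_ext('http://example.com/video.mp4?download=1')   # -> 'mp4'
#   determine_ext('http://example.com/foo/bar.mp4/?download')  # -> 'mp4' (trailing slash stripped)
#   determine_ext('http://example.com/page')                   # -> 'unknown_video'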
1779
1780 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1781 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1782
1783
1784 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1785 R"""
1786 Return a datetime object from a string.
1787 Supported format:
1788 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1789
1790 @param format strftime format of DATE
1791 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1792 auto: round to the unit provided in date_str (if applicable).
1793 """
1794 auto_precision = False
1795 if precision == 'auto':
1796 auto_precision = True
1797 precision = 'microsecond'
1798 today = datetime_round(datetime.datetime.utcnow(), precision)
1799 if date_str in ('now', 'today'):
1800 return today
1801 if date_str == 'yesterday':
1802 return today - datetime.timedelta(days=1)
1803 match = re.match(
1804 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1805 date_str)
1806 if match is not None:
1807 start_time = datetime_from_str(match.group('start'), precision, format)
1808 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1809 unit = match.group('unit')
1810 if unit == 'month' or unit == 'year':
1811 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1812 unit = 'day'
1813 else:
1814 if unit == 'week':
1815 unit = 'day'
1816 time *= 7
1817 delta = datetime.timedelta(**{unit + 's': time})
1818 new_date = start_time + delta
1819 if auto_precision:
1820 return datetime_round(new_date, unit)
1821 return new_date
1822
1823 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1824
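# Illustrative date specifications (results of the first two depend on the current time):
#   datetime_from_str('now-1day')         # this time yesterday, rounded to day precision ('auto')
#   datetime_from_str('today+2weeks')     # 14 days from now, rounded to the day
#   datetime_from_str('20200101+1month')  # -> datetime.datetime(2020, 2, 1, 0, 0)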
1825
1826 def date_from_str(date_str, format='%Y%m%d', strict=False):
1827 R"""
1828 Return a date object from a string using datetime_from_str
1829
1830 @param strict Restrict allowed patterns to "YYYYMMDD" and
1831 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1832 """
1833 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1834 raise ValueError(f'Invalid date format "{date_str}"')
1835 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1836
1837
1838 def datetime_add_months(dt, months):
1839 """Increment/Decrement a datetime object by months."""
1840 month = dt.month + months - 1
1841 year = dt.year + month // 12
1842 month = month % 12 + 1
1843 day = min(dt.day, calendar.monthrange(year, month)[1])
1844 return dt.replace(year, month, day)
1845
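# Illustrative usage - the day of month is clamped to the target month's length:
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1)   # -> datetime.datetime(2020, 2, 29, 0, 0)
#   datetime_add_months(datetime.datetime(2020, 3, 15), -3)  # -> datetime.datetime(2019, 12, 15, 0, 0)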
1846
1847 def datetime_round(dt, precision='day'):
1848 """
1849 Round a datetime object's time to a specific precision
1850 """
1851 if precision == 'microsecond':
1852 return dt
1853
1854 unit_seconds = {
1855 'day': 86400,
1856 'hour': 3600,
1857 'minute': 60,
1858 'second': 1,
1859 }
1860 roundto = lambda x, n: ((x + n / 2) // n) * n
1861 timestamp = calendar.timegm(dt.timetuple())
1862 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1863
1864
1865 def hyphenate_date(date_str):
1866 """
1867 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1868 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1869 if match is not None:
1870 return '-'.join(match.groups())
1871 else:
1872 return date_str
1873
1874
1875 class DateRange:
1876 """Represents a time interval between two dates"""
1877
1878 def __init__(self, start=None, end=None):
1879 """start and end must be strings in the format accepted by date"""
1880 if start is not None:
1881 self.start = date_from_str(start, strict=True)
1882 else:
1883 self.start = datetime.datetime.min.date()
1884 if end is not None:
1885 self.end = date_from_str(end, strict=True)
1886 else:
1887 self.end = datetime.datetime.max.date()
1888 if self.start > self.end:
1889 raise ValueError('Date range "%s": the start date must be before the end date' % self)
1890
1891 @classmethod
1892 def day(cls, day):
1893 """Returns a range that only contains the given day"""
1894 return cls(day, day)
1895
1896 def __contains__(self, date):
1897 """Check if the date is in the range"""
1898 if not isinstance(date, datetime.date):
1899 date = date_from_str(date)
1900 return self.start <= date <= self.end
1901
1902 def __str__(self):
1903 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1904
1905 def __eq__(self, other):
1906 return (isinstance(other, DateRange)
1907 and self.start == other.start and self.end == other.end)
1908
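# Illustrative usage:
#   '20200315' in DateRange('20200101', '20200630')         # -> True
#   datetime.date(2021, 1, 1) in DateRange(end='20200630')  # -> False
#   DateRange.day('20200101')                               # range containing only 2020-01-01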
1909
1910 def platform_name():
1911 """ Returns the platform name as a str """
1912 write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead')
1913 return platform.platform()
1914
1915
1916 @functools.cache
1917 def system_identifier():
1918 python_implementation = platform.python_implementation()
1919 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1920 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1921
1922 return 'Python %s (%s %s) - %s %s' % (
1923 platform.python_version(),
1924 python_implementation,
1925 platform.architecture()[0],
1926 platform.platform(),
1927 format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
1928 )
1929
1930
1931 @functools.cache
1932 def get_windows_version():
1933 ''' Get the Windows version. Returns () if not running on Windows '''
1934 if compat_os_name == 'nt':
1935 return version_tuple(platform.win32_ver()[1])
1936 else:
1937 return ()
1938
1939
1940 def write_string(s, out=None, encoding=None):
1941 assert isinstance(s, str)
1942 out = out or sys.stderr
1943
1944 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1945 s = re.sub(r'([\r\n]+)', r' \1', s)
1946
1947 enc, buffer = None, out
1948 if 'b' in getattr(out, 'mode', ''):
1949 enc = encoding or preferredencoding()
1950 elif hasattr(out, 'buffer'):
1951 buffer = out.buffer
1952 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1953
1954 buffer.write(s.encode(enc, 'ignore') if enc else s)
1955 out.flush()
1956
1957
1958 def bytes_to_intlist(bs):
1959 if not bs:
1960 return []
1961 if isinstance(bs[0], int): # bytes and bytearray elements are already ints
1962 return list(bs)
1963 else:
1964 return [ord(c) for c in bs]
1965
1966
1967 def intlist_to_bytes(xs):
1968 if not xs:
1969 return b''
1970 return struct.pack('%dB' % len(xs), *xs)
1971
1972
1973 class LockingUnsupportedError(OSError):
1974 msg = 'File locking is not supported'
1975
1976 def __init__(self):
1977 super().__init__(self.msg)
1978
1979
1980 # Cross-platform file locking
1981 if sys.platform == 'win32':
1982 import ctypes.wintypes
1983 import msvcrt
1984
1985 class OVERLAPPED(ctypes.Structure):
1986 _fields_ = [
1987 ('Internal', ctypes.wintypes.LPVOID),
1988 ('InternalHigh', ctypes.wintypes.LPVOID),
1989 ('Offset', ctypes.wintypes.DWORD),
1990 ('OffsetHigh', ctypes.wintypes.DWORD),
1991 ('hEvent', ctypes.wintypes.HANDLE),
1992 ]
1993
1994 kernel32 = ctypes.windll.kernel32
1995 LockFileEx = kernel32.LockFileEx
1996 LockFileEx.argtypes = [
1997 ctypes.wintypes.HANDLE, # hFile
1998 ctypes.wintypes.DWORD, # dwFlags
1999 ctypes.wintypes.DWORD, # dwReserved
2000 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2001 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2002 ctypes.POINTER(OVERLAPPED) # Overlapped
2003 ]
2004 LockFileEx.restype = ctypes.wintypes.BOOL
2005 UnlockFileEx = kernel32.UnlockFileEx
2006 UnlockFileEx.argtypes = [
2007 ctypes.wintypes.HANDLE, # hFile
2008 ctypes.wintypes.DWORD, # dwReserved
2009 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2010 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2011 ctypes.POINTER(OVERLAPPED) # Overlapped
2012 ]
2013 UnlockFileEx.restype = ctypes.wintypes.BOOL
2014 whole_low = 0xffffffff
2015 whole_high = 0x7fffffff
2016
2017 def _lock_file(f, exclusive, block):
2018 overlapped = OVERLAPPED()
2019 overlapped.Offset = 0
2020 overlapped.OffsetHigh = 0
2021 overlapped.hEvent = 0
2022 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2023
2024 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2025 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2026 0, whole_low, whole_high, f._lock_file_overlapped_p):
2027 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2028 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2029
2030 def _unlock_file(f):
2031 assert f._lock_file_overlapped_p
2032 handle = msvcrt.get_osfhandle(f.fileno())
2033 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2034 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2035
2036 else:
2037 try:
2038 import fcntl
2039
2040 def _lock_file(f, exclusive, block):
2041 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2042 if not block:
2043 flags |= fcntl.LOCK_NB
2044 try:
2045 fcntl.flock(f, flags)
2046 except BlockingIOError:
2047 raise
2048 except OSError: # AOSP does not have flock()
2049 fcntl.lockf(f, flags)
2050
2051 def _unlock_file(f):
2052 try:
2053 fcntl.flock(f, fcntl.LOCK_UN)
2054 except OSError:
2055 fcntl.lockf(f, fcntl.LOCK_UN)
2056
2057 except ImportError:
2058
2059 def _lock_file(f, exclusive, block):
2060 raise LockingUnsupportedError()
2061
2062 def _unlock_file(f):
2063 raise LockingUnsupportedError()
2064
2065
2066 class locked_file:
2067 locked = False
2068
2069 def __init__(self, filename, mode, block=True, encoding=None):
2070 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2071 raise NotImplementedError(mode)
2072 self.mode, self.block = mode, block
2073
2074 writable = any(f in mode for f in 'wax+')
2075 readable = any(f in mode for f in 'r+')
2076 flags = functools.reduce(operator.ior, (
2077 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2078 getattr(os, 'O_BINARY', 0), # Windows only
2079 getattr(os, 'O_NOINHERIT', 0), # Windows only
2080 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2081 os.O_APPEND if 'a' in mode else 0,
2082 os.O_EXCL if 'x' in mode else 0,
2083 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2084 ))
2085
2086 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2087
2088 def __enter__(self):
2089 exclusive = 'r' not in self.mode
2090 try:
2091 _lock_file(self.f, exclusive, self.block)
2092 self.locked = True
2093 except OSError:
2094 self.f.close()
2095 raise
2096 if 'w' in self.mode:
2097 try:
2098 self.f.truncate()
2099 except OSError as e:
2100 if e.errno not in (
2101 errno.ESPIPE, # Illegal seek - expected for FIFO
2102 errno.EINVAL, # Invalid argument - expected for /dev/null
2103 ):
2104 raise
2105 return self
2106
2107 def unlock(self):
2108 if not self.locked:
2109 return
2110 try:
2111 _unlock_file(self.f)
2112 finally:
2113 self.locked = False
2114
2115 def __exit__(self, *_):
2116 try:
2117 self.unlock()
2118 finally:
2119 self.f.close()
2120
2121 open = __enter__
2122 close = __exit__
2123
2124 def __getattr__(self, attr):
2125 return getattr(self.f, attr)
2126
2127 def __iter__(self):
2128 return iter(self.f)
2129
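# Illustrative usage (the filename is hypothetical); the lock is acquired on
# __enter__ and released, and the file closed, on __exit__. With block=False,
# a lock already held elsewhere raises instead of waiting:
#   with locked_file('archive.txt', 'a', block=False) as f:
#       f.write('entry\n')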
2130
2131 @functools.cache
2132 def get_filesystem_encoding():
2133 encoding = sys.getfilesystemencoding()
2134 return encoding if encoding is not None else 'utf-8'
2135
2136
2137 def shell_quote(args):
2138 quoted_args = []
2139 encoding = get_filesystem_encoding()
2140 for a in args:
2141 if isinstance(a, bytes):
2142 # We may get a filename encoded with 'encodeFilename'
2143 a = a.decode(encoding)
2144 quoted_args.append(compat_shlex_quote(a))
2145 return ' '.join(quoted_args)
2146
2147
2148 def smuggle_url(url, data):
2149 """ Pass additional data in a URL for internal use. """
2150
2151 url, idata = unsmuggle_url(url, {})
2152 data.update(idata)
2153 sdata = urllib.parse.urlencode(
2154 {'__youtubedl_smuggle': json.dumps(data)})
2155 return url + '#' + sdata
2156
2157
2158 def unsmuggle_url(smug_url, default=None):
2159 if '#__youtubedl_smuggle' not in smug_url:
2160 return smug_url, default
2161 url, _, sdata = smug_url.rpartition('#')
2162 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2163 data = json.loads(jsond)
2164 return url, data
2165
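# Illustrative round-trip (URL and payload are hypothetical):
#   url = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
#   unsmuggle_url(url)  # -> ('https://example.com/video', {'referer': 'https://example.com/'})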
2166
2167 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2168 """ Formats numbers with decimal sufixes like K, M, etc """
2169 num, factor = float_or_none(num), float(factor)
2170 if num is None or num < 0:
2171 return None
2172 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2173 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2174 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2175 if factor == 1024:
2176 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2177 converted = num / (factor ** exponent)
2178 return fmt % (converted, suffix)
2179
2180
2181 def format_bytes(bytes):
2182 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2183
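# Illustrative usage:
#   format_decimal_suffix(123456)             # -> '123k'
#   format_decimal_suffix(2048, factor=1024)  # -> '2Ki'
#   format_bytes(1024 ** 2)                   # -> '1.00MiB'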
2184
2185 def lookup_unit_table(unit_table, s):
2186 units_re = '|'.join(re.escape(u) for u in unit_table)
2187 m = re.match(
2188 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2189 if not m:
2190 return None
2191 num_str = m.group('num').replace(',', '.')
2192 mult = unit_table[m.group('unit')]
2193 return int(float(num_str) * mult)
2194
2195
2196 def parse_filesize(s):
2197 if s is None:
2198 return None
2199
2200 # The lower-case forms are of course incorrect and unofficial,
2201 # but we support those too
2202 _UNIT_TABLE = {
2203 'B': 1,
2204 'b': 1,
2205 'bytes': 1,
2206 'KiB': 1024,
2207 'KB': 1000,
2208 'kB': 1024,
2209 'Kb': 1000,
2210 'kb': 1000,
2211 'kilobytes': 1000,
2212 'kibibytes': 1024,
2213 'MiB': 1024 ** 2,
2214 'MB': 1000 ** 2,
2215 'mB': 1024 ** 2,
2216 'Mb': 1000 ** 2,
2217 'mb': 1000 ** 2,
2218 'megabytes': 1000 ** 2,
2219 'mebibytes': 1024 ** 2,
2220 'GiB': 1024 ** 3,
2221 'GB': 1000 ** 3,
2222 'gB': 1024 ** 3,
2223 'Gb': 1000 ** 3,
2224 'gb': 1000 ** 3,
2225 'gigabytes': 1000 ** 3,
2226 'gibibytes': 1024 ** 3,
2227 'TiB': 1024 ** 4,
2228 'TB': 1000 ** 4,
2229 'tB': 1024 ** 4,
2230 'Tb': 1000 ** 4,
2231 'tb': 1000 ** 4,
2232 'terabytes': 1000 ** 4,
2233 'tebibytes': 1024 ** 4,
2234 'PiB': 1024 ** 5,
2235 'PB': 1000 ** 5,
2236 'pB': 1024 ** 5,
2237 'Pb': 1000 ** 5,
2238 'pb': 1000 ** 5,
2239 'petabytes': 1000 ** 5,
2240 'pebibytes': 1024 ** 5,
2241 'EiB': 1024 ** 6,
2242 'EB': 1000 ** 6,
2243 'eB': 1024 ** 6,
2244 'Eb': 1000 ** 6,
2245 'eb': 1000 ** 6,
2246 'exabytes': 1000 ** 6,
2247 'exbibytes': 1024 ** 6,
2248 'ZiB': 1024 ** 7,
2249 'ZB': 1000 ** 7,
2250 'zB': 1024 ** 7,
2251 'Zb': 1000 ** 7,
2252 'zb': 1000 ** 7,
2253 'zettabytes': 1000 ** 7,
2254 'zebibytes': 1024 ** 7,
2255 'YiB': 1024 ** 8,
2256 'YB': 1000 ** 8,
2257 'yB': 1024 ** 8,
2258 'Yb': 1000 ** 8,
2259 'yb': 1000 ** 8,
2260 'yottabytes': 1000 ** 8,
2261 'yobibytes': 1024 ** 8,
2262 }
2263
2264 return lookup_unit_table(_UNIT_TABLE, s)
2265
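# Illustrative usage (note the binary/decimal distinction):
#   parse_filesize('5 GB')     # -> 5000000000
#   parse_filesize('1.2Tb')    # -> 1200000000000
#   parse_filesize('1,24 KB')  # -> 1240 (comma accepted as decimal separator)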
2266
2267 def parse_count(s):
2268 if s is None:
2269 return None
2270
2271 s = re.sub(r'^[^\d]+\s', '', s).strip()
2272
2273 if re.match(r'^[\d,.]+$', s):
2274 return str_to_int(s)
2275
2276 _UNIT_TABLE = {
2277 'k': 1000,
2278 'K': 1000,
2279 'm': 1000 ** 2,
2280 'M': 1000 ** 2,
2281 'kk': 1000 ** 2,
2282 'KK': 1000 ** 2,
2283 'b': 1000 ** 3,
2284 'B': 1000 ** 3,
2285 }
2286
2287 ret = lookup_unit_table(_UNIT_TABLE, s)
2288 if ret is not None:
2289 return ret
2290
2291 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2292 if mobj:
2293 return str_to_int(mobj.group(1))
2294
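# Illustrative usage:
#   parse_count('1.8M views')  # -> 1800000
#   parse_count('100k')        # -> 100000
#   parse_count('1,234')       # -> 1234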
2295
2296 def parse_resolution(s, *, lenient=False):
2297 if s is None:
2298 return {}
2299
2300 if lenient:
2301 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2302 else:
2303 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2304 if mobj:
2305 return {
2306 'width': int(mobj.group('w')),
2307 'height': int(mobj.group('h')),
2308 }
2309
2310 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2311 if mobj:
2312 return {'height': int(mobj.group(1))}
2313
2314 mobj = re.search(r'\b([48])[kK]\b', s)
2315 if mobj:
2316 return {'height': int(mobj.group(1)) * 540}
2317
2318 return {}
2319
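# Illustrative usage:
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # -> {'height': 720}
#   parse_resolution('4k')         # -> {'height': 2160}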
2320
2321 def parse_bitrate(s):
2322 if not isinstance(s, str):
2323 return
2324 mobj = re.search(r'\b(\d+)\s*kbps', s)
2325 if mobj:
2326 return int(mobj.group(1))
2327
2328
2329 def month_by_name(name, lang='en'):
2330 """ Return the number of a month by (locale-independently) English name """
2331
2332 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2333
2334 try:
2335 return month_names.index(name) + 1
2336 except ValueError:
2337 return None
2338
2339
2340 def month_by_abbreviation(abbrev):
2341 """ Return the number of a month by (locale-independently) English
2342 abbreviations """
2343
2344 try:
2345 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2346 except ValueError:
2347 return None
2348
2349
2350 def fix_xml_ampersands(xml_str):
2351 """Replace all the '&' by '&amp;' in XML"""
2352 return re.sub(
2353 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2354 '&amp;',
2355 xml_str)
2356
2357
2358 def setproctitle(title):
2359 assert isinstance(title, str)
2360
2361 # ctypes in Jython is not complete
2362 # http://bugs.jython.org/issue2148
2363 if sys.platform.startswith('java'):
2364 return
2365
2366 try:
2367 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2368 except OSError:
2369 return
2370 except TypeError:
2371 # LoadLibrary in Windows Python 2.7.13 only expects
2372 # a bytestring, but since unicode_literals turns
2373 # every string into a unicode string, it fails.
2374 return
2375 title_bytes = title.encode()
2376 buf = ctypes.create_string_buffer(len(title_bytes))
2377 buf.value = title_bytes
2378 try:
2379 libc.prctl(15, buf, 0, 0, 0)
2380 except AttributeError:
2381 return # Strange libc, just skip this
2382
2383
2384 def remove_start(s, start):
2385 return s[len(start):] if s is not None and s.startswith(start) else s
2386
2387
2388 def remove_end(s, end):
2389 return s[:-len(end)] if s is not None and s.endswith(end) else s
2390
2391
2392 def remove_quotes(s):
2393 if s is None or len(s) < 2:
2394 return s
2395 for quote in ('"', "'", ):
2396 if s[0] == quote and s[-1] == quote:
2397 return s[1:-1]
2398 return s
2399
2400
2401 def get_domain(url):
2402 """
2403 This implementation is inconsistent, but is kept for compatibility.
2404 Use this only for "webpage_url_domain"
2405 """
2406 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2407
2408
2409 def url_basename(url):
2410 path = urllib.parse.urlparse(url).path
2411 return path.strip('/').split('/')[-1]
2412
2413
2414 def base_url(url):
2415 return re.match(r'https?://[^?#&]+/', url).group()
2416
2417
2418 def urljoin(base, path):
2419 if isinstance(path, bytes):
2420 path = path.decode()
2421 if not isinstance(path, str) or not path:
2422 return None
2423 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2424 return path
2425 if isinstance(base, bytes):
2426 base = base.decode()
2427 if not isinstance(base, str) or not re.match(
2428 r'^(?:https?:)?//', base):
2429 return None
2430 return urllib.parse.urljoin(base, path)
2431
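# Illustrative usage; the base must be an http(s) URL, else None is returned:
#   urljoin('https://example.com/a/b', 'c/d')  # -> 'https://example.com/a/c/d'
#   urljoin('https://example.com/a/', '/c')    # -> 'https://example.com/c'
#   urljoin('ftp://example.com/', 'a')         # -> None (non-http(s) base)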
2432
2433 class HEADRequest(urllib.request.Request):
2434 def get_method(self):
2435 return 'HEAD'
2436
2437
2438 class PUTRequest(urllib.request.Request):
2439 def get_method(self):
2440 return 'PUT'
2441
2442
2443 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2444 if get_attr and v is not None:
2445 v = getattr(v, get_attr, None)
2446 try:
2447 return int(v) * invscale // scale
2448 except (ValueError, TypeError, OverflowError):
2449 return default
2450
2451
2452 def str_or_none(v, default=None):
2453 return default if v is None else str(v)
2454
2455
2456 def str_to_int(int_str):
2457 """ A more relaxed version of int_or_none """
2458 if isinstance(int_str, int):
2459 return int_str
2460 elif isinstance(int_str, str):
2461 int_str = re.sub(r'[,\.\+]', '', int_str)
2462 return int_or_none(int_str)
2463
2464
2465 def float_or_none(v, scale=1, invscale=1, default=None):
2466 if v is None:
2467 return default
2468 try:
2469 return float(v) * invscale / scale
2470 except (ValueError, TypeError):
2471 return default
2472
2473
2474 def bool_or_none(v, default=None):
2475 return v if isinstance(v, bool) else default
2476
2477
2478 def strip_or_none(v, default=None):
2479 return v.strip() if isinstance(v, str) else default
2480
2481
2482 def url_or_none(url):
2483 if not url or not isinstance(url, str):
2484 return None
2485 url = url.strip()
2486 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2487
2488
2489 def request_to_url(req):
2490 if isinstance(req, urllib.request.Request):
2491 return req.get_full_url()
2492 else:
2493 return req
2494
2495
2496 def strftime_or_none(timestamp, date_format, default=None):
2497 datetime_object = None
2498 try:
2499 if isinstance(timestamp, (int, float)): # unix timestamp
2500 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2501 elif isinstance(timestamp, str): # assume YYYYMMDD
2502 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2503 return datetime_object.strftime(date_format)
2504 except (ValueError, TypeError, AttributeError):
2505 return default
2506
2507
2508 def parse_duration(s):
2509 if not isinstance(s, str):
2510 return None
2511 s = s.strip()
2512 if not s:
2513 return None
2514
2515 days, hours, mins, secs, ms = [None] * 5
2516 m = re.match(r'''(?x)
2517 (?P<before_secs>
2518 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2519 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2520 (?P<ms>[.:][0-9]+)?Z?$
2521 ''', s)
2522 if m:
2523 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2524 else:
2525 m = re.match(
2526 r'''(?ix)(?:P?
2527 (?:
2528 [0-9]+\s*y(?:ears?)?,?\s*
2529 )?
2530 (?:
2531 [0-9]+\s*m(?:onths?)?,?\s*
2532 )?
2533 (?:
2534 [0-9]+\s*w(?:eeks?)?,?\s*
2535 )?
2536 (?:
2537 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2538 )?
2539 T)?
2540 (?:
2541 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2542 )?
2543 (?:
2544 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2545 )?
2546 (?:
2547 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2548 )?Z?$''', s)
2549 if m:
2550 days, hours, mins, secs, ms = m.groups()
2551 else:
2552 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2553 if m:
2554 hours, mins = m.groups()
2555 else:
2556 return None
2557
2558 if ms:
2559 ms = ms.replace(':', '.')
2560 return sum(float(part or 0) * mult for part, mult in (
2561 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2562
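# Illustrative usage (the result is always a float):
#   parse_duration('9:54:11')    # -> 35651.0
#   parse_duration('PT1H30M')    # -> 5400.0
#   parse_duration('2.5 hours')  # -> 9000.0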
2563
2564 def prepend_extension(filename, ext, expected_real_ext=None):
2565 name, real_ext = os.path.splitext(filename)
2566 return (
2567 f'{name}.{ext}{real_ext}'
2568 if not expected_real_ext or real_ext[1:] == expected_real_ext
2569 else f'{filename}.{ext}')
2570
2571
2572 def replace_extension(filename, ext, expected_real_ext=None):
2573 name, real_ext = os.path.splitext(filename)
2574 return '{}.{}'.format(
2575 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2576 ext)
2577
2578
2579 def check_executable(exe, args=[]):
2580 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2581 args can be a list of arguments for a short output (like -version) """
2582 try:
2583 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2584 except OSError:
2585 return False
2586 return exe
2587
2588
2589 def _get_exe_version_output(exe, args, *, to_screen=None):
2590 if to_screen:
2591 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2592 try:
2593 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2594 # SIGTTOU if yt-dlp is run in the background.
2595 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2596 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2597 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2598 except OSError:
2599 return False
2600 return stdout
2601
2602
2603 def detect_exe_version(output, version_re=None, unrecognized='present'):
2604 assert isinstance(output, str)
2605 if version_re is None:
2606 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2607 m = re.search(version_re, output)
2608 if m:
2609 return m.group(1)
2610 else:
2611 return unrecognized
2612
2613
2614 def get_exe_version(exe, args=['--version'],
2615 version_re=None, unrecognized='present'):
2616 """ Returns the version of the specified executable,
2617 or False if the executable is not present """
2618 out = _get_exe_version_output(exe, args)
2619 return detect_exe_version(out, version_re, unrecognized) if out else False
2620
2621
2622 def frange(start=0, stop=None, step=1):
2623 """Float range"""
2624 if stop is None:
2625 start, stop = 0, start
2626 sign = [-1, 1][step > 0] if step else 0
2627 while sign * start < sign * stop:
2628 yield start
2629 start += step
2630
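# Illustrative usage:
#   list(frange(5))           # -> [0, 1, 2, 3, 4]
#   list(frange(0, 1, 0.25))  # -> [0, 0.25, 0.5, 0.75]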
2631
2632 class LazyList(collections.abc.Sequence):
2633 """Lazy immutable list from an iterable
2634 Note that slices of a LazyList are lists and not LazyList"""
2635
2636 class IndexError(IndexError):
2637 pass
2638
2639 def __init__(self, iterable, *, reverse=False, _cache=None):
2640 self._iterable = iter(iterable)
2641 self._cache = [] if _cache is None else _cache
2642 self._reversed = reverse
2643
2644 def __iter__(self):
2645 if self._reversed:
2646 # We need to consume the entire iterable to iterate in reverse
2647 yield from self.exhaust()
2648 return
2649 yield from self._cache
2650 for item in self._iterable:
2651 self._cache.append(item)
2652 yield item
2653
2654 def _exhaust(self):
2655 self._cache.extend(self._iterable)
2656 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2657 return self._cache
2658
2659 def exhaust(self):
2660 """Evaluate the entire iterable"""
2661 return self._exhaust()[::-1 if self._reversed else 1]
2662
2663 @staticmethod
2664 def _reverse_index(x):
2665 return None if x is None else ~x
2666
2667 def __getitem__(self, idx):
2668 if isinstance(idx, slice):
2669 if self._reversed:
2670 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2671 start, stop, step = idx.start, idx.stop, idx.step or 1
2672 elif isinstance(idx, int):
2673 if self._reversed:
2674 idx = self._reverse_index(idx)
2675 start, stop, step = idx, idx, 0
2676 else:
2677 raise TypeError('indices must be integers or slices')
2678 if ((start or 0) < 0 or (stop or 0) < 0
2679 or (start is None and step < 0)
2680 or (stop is None and step > 0)):
2681 # We need to consume the entire iterable to be able to slice from the end
2682 # Obviously, never use this with infinite iterables
2683 self._exhaust()
2684 try:
2685 return self._cache[idx]
2686 except IndexError as e:
2687 raise self.IndexError(e) from e
2688 n = max(start or 0, stop or 0) - len(self._cache) + 1
2689 if n > 0:
2690 self._cache.extend(itertools.islice(self._iterable, n))
2691 try:
2692 return self._cache[idx]
2693 except IndexError as e:
2694 raise self.IndexError(e) from e
2695
2696 def __bool__(self):
2697 try:
2698 self[-1] if self._reversed else self[0]
2699 except self.IndexError:
2700 return False
2701 return True
2702
2703 def __len__(self):
2704 self._exhaust()
2705 return len(self._cache)
2706
2707 def __reversed__(self):
2708 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2709
2710 def __copy__(self):
2711 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2712
2713 def __repr__(self):
2714 # repr and str should mimic a list. So we exhaust the iterable
2715 return repr(self.exhaust())
2716
2717 def __str__(self):
2718 return repr(self.exhaust())
2719
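# Illustrative usage - items are pulled from the iterable only as needed:
#   lazy = LazyList(itertools.count())
#   lazy[4]   # -> 4 (consumes the first five items)
#   lazy[:3]  # -> [0, 1, 2] (slices are plain lists)
# Note: len(lazy) or reversed(lazy) exhausts the iterable - never do this
# with an infinite one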
2720
2721 class PagedList:
2722
2723 class IndexError(IndexError):
2724 pass
2725
2726 def __len__(self):
2727 # This is only useful for tests
2728 return len(self.getslice())
2729
2730 def __init__(self, pagefunc, pagesize, use_cache=True):
2731 self._pagefunc = pagefunc
2732 self._pagesize = pagesize
2733 self._pagecount = float('inf')
2734 self._use_cache = use_cache
2735 self._cache = {}
2736
2737 def getpage(self, pagenum):
2738 page_results = self._cache.get(pagenum)
2739 if page_results is None:
2740 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2741 if self._use_cache:
2742 self._cache[pagenum] = page_results
2743 return page_results
2744
2745 def getslice(self, start=0, end=None):
2746 return list(self._getslice(start, end))
2747
2748 def _getslice(self, start, end):
2749 raise NotImplementedError('This method must be implemented by subclasses')
2750
2751 def __getitem__(self, idx):
2752 assert self._use_cache, 'Indexing PagedList requires cache'
2753 if not isinstance(idx, int) or idx < 0:
2754 raise TypeError('indices must be non-negative integers')
2755 entries = self.getslice(idx, idx + 1)
2756 if not entries:
2757 raise self.IndexError()
2758 return entries[0]
2759
2760
2761 class OnDemandPagedList(PagedList):
2762 """Download pages until a page with less than maximum results"""
2763
2764 def _getslice(self, start, end):
2765 for pagenum in itertools.count(start // self._pagesize):
2766 firstid = pagenum * self._pagesize
2767 nextfirstid = pagenum * self._pagesize + self._pagesize
2768 if start >= nextfirstid:
2769 continue
2770
2771 startv = (
2772 start % self._pagesize
2773 if firstid <= start < nextfirstid
2774 else 0)
2775 endv = (
2776 ((end - 1) % self._pagesize) + 1
2777 if (end is not None and firstid <= end <= nextfirstid)
2778 else None)
2779
2780 try:
2781 page_results = self.getpage(pagenum)
2782 except Exception:
2783 self._pagecount = pagenum - 1
2784 raise
2785 if startv != 0 or endv is not None:
2786 page_results = page_results[startv:endv]
2787 yield from page_results
2788
2789 # A little optimization: if the current page is not "full", i.e. does
2790 # not contain page_size videos, then we can assume that this page
2791 # is the last one - there are no more ids on further pages,
2792 # so there is no need to query again.
2793 if len(page_results) + startv < self._pagesize:
2794 break
2795
2796 # If we got the whole page, but the next page is not interesting,
2797 # break out early as well
2798 if end == nextfirstid:
2799 break
2800
2801
2802 class InAdvancePagedList(PagedList):
2803 """PagedList with total number of pages known in advance"""
2804
2805 def __init__(self, pagefunc, pagecount, pagesize):
2806 PagedList.__init__(self, pagefunc, pagesize, True)
2807 self._pagecount = pagecount
2808
2809 def _getslice(self, start, end):
2810 start_page = start // self._pagesize
2811 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2812 skip_elems = start - start_page * self._pagesize
2813 only_more = None if end is None else end - start
2814 for pagenum in range(start_page, end_page):
2815 page_results = self.getpage(pagenum)
2816 if skip_elems:
2817 page_results = page_results[skip_elems:]
2818 skip_elems = None
2819 if only_more is not None:
2820 if len(page_results) < only_more:
2821 only_more -= len(page_results)
2822 else:
2823 yield from page_results[:only_more]
2824 break
2825 yield from page_results
2826
2827
2828 class PlaylistEntries:
2829 MissingEntry = object()
2830 is_exhausted = False
2831
2832 def __init__(self, ydl, info_dict):
2833 self.ydl = ydl
2834
2835 # _entries must be assigned now since info_dict can change during iteration
2836 entries = info_dict.get('entries')
2837 if entries is None:
2838 raise EntryNotInPlaylist('There are no entries')
2839 elif isinstance(entries, list):
2840 self.is_exhausted = True
2841
2842 requested_entries = info_dict.get('requested_entries')
2843 self.is_incomplete = bool(requested_entries)
2844 if self.is_incomplete:
2845 assert self.is_exhausted
2846 self._entries = [self.MissingEntry] * max(requested_entries)
2847 for i, entry in zip(requested_entries, entries):
2848 self._entries[i - 1] = entry
2849 elif isinstance(entries, (list, PagedList, LazyList)):
2850 self._entries = entries
2851 else:
2852 self._entries = LazyList(entries)
2853
2854 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2855 (?P<start>[+-]?\d+)?
2856 (?P<range>[:-]
2857 (?P<end>[+-]?\d+|inf(?:inite)?)?
2858 (?::(?P<step>[+-]?\d+))?
2859 )?''')
2860
2861 @classmethod
2862 def parse_playlist_items(cls, string):
2863 for segment in string.split(','):
2864 if not segment:
2865 raise ValueError('There are two or more consecutive commas')
2866 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2867 if not mobj:
2868 raise ValueError(f'{segment!r} is not a valid specification')
2869 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2870 if int_or_none(step) == 0:
2871 raise ValueError(f'Step in {segment!r} cannot be zero')
2872 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2873
2874 def get_requested_items(self):
2875 playlist_items = self.ydl.params.get('playlist_items')
2876 playlist_start = self.ydl.params.get('playliststart', 1)
2877 playlist_end = self.ydl.params.get('playlistend')
2878 # For backwards compatibility, interpret -1 as whole list
2879 if playlist_end in (-1, None):
2880 playlist_end = ''
2881 if not playlist_items:
2882 playlist_items = f'{playlist_start}:{playlist_end}'
2883 elif playlist_start != 1 or playlist_end:
2884 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2885
2886 for index in self.parse_playlist_items(playlist_items):
2887 for i, entry in self[index]:
2888 yield i, entry
2889 if not entry:
2890 continue
2891 try:
2892 # TODO: Add auto-generated fields
2893 self.ydl._match_entry(entry, incomplete=True, silent=True)
2894 except (ExistingVideoReached, RejectedVideoReached):
2895 return
2896
2897 def get_full_count(self):
2898 if self.is_exhausted and not self.is_incomplete:
2899 return len(self)
2900 elif isinstance(self._entries, InAdvancePagedList):
2901 if self._entries._pagesize == 1:
2902 return self._entries._pagecount
2903
2904 @functools.cached_property
2905 def _getter(self):
2906 if isinstance(self._entries, list):
2907 def get_entry(i):
2908 try:
2909 entry = self._entries[i]
2910 except IndexError:
2911 entry = self.MissingEntry
2912 if not self.is_incomplete:
2913 raise self.IndexError()
2914 if entry is self.MissingEntry:
2915 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2916 return entry
2917 else:
2918 def get_entry(i):
2919 try:
2920 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2921 except (LazyList.IndexError, PagedList.IndexError):
2922 raise self.IndexError()
2923 return get_entry
2924
2925 def __getitem__(self, idx):
2926 if isinstance(idx, int):
2927 idx = slice(idx, idx)
2928
2929 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2930 step = 1 if idx.step is None else idx.step
2931 if idx.start is None:
2932 start = 0 if step > 0 else len(self) - 1
2933 else:
2934 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2935
2936 # NB: Do not call len(self) when idx == [:]
2937 if idx.stop is None:
2938 stop = 0 if step < 0 else float('inf')
2939 else:
2940 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2941 stop += [-1, 1][step > 0]
2942
2943 for i in frange(start, stop, step):
2944 if i < 0:
2945 continue
2946 try:
2947 entry = self._getter(i)
2948 except self.IndexError:
2949 self.is_exhausted = True
2950 if step > 0:
2951 break
2952 continue
2953 yield i + 1, entry
2954
2955 def __len__(self):
2956 return len(tuple(self[:]))
2957
2958 class IndexError(IndexError):
2959 pass
2960
2961
2962 def uppercase_escape(s):
2963 unicode_escape = codecs.getdecoder('unicode_escape')
2964 return re.sub(
2965 r'\\U[0-9a-fA-F]{8}',
2966 lambda m: unicode_escape(m.group(0))[0],
2967 s)
2968
2969
2970 def lowercase_escape(s):
2971 unicode_escape = codecs.getdecoder('unicode_escape')
2972 return re.sub(
2973 r'\\u[0-9a-fA-F]{4}',
2974 lambda m: unicode_escape(m.group(0))[0],
2975 s)
2976
2977
2978 def escape_rfc3986(s):
2979 """Escape non-ASCII characters as suggested by RFC 3986"""
2980 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2981
2982
2983 def escape_url(url):
2984 """Escape URL as suggested by RFC 3986"""
2985 url_parsed = urllib.parse.urlparse(url)
2986 return url_parsed._replace(
2987 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2988 path=escape_rfc3986(url_parsed.path),
2989 params=escape_rfc3986(url_parsed.params),
2990 query=escape_rfc3986(url_parsed.query),
2991 fragment=escape_rfc3986(url_parsed.fragment)
2992 ).geturl()
2993
2994
2995 def parse_qs(url):
2996 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
2997
2998
2999 def read_batch_urls(batch_fd):
3000 def fixup(url):
3001 if not isinstance(url, str):
3002 url = url.decode('utf-8', 'replace')
3003 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3004 for bom in BOM_UTF8:
3005 if url.startswith(bom):
3006 url = url[len(bom):]
3007 url = url.lstrip()
3008 if not url or url.startswith(('#', ';', ']')):
3009 return False
3010 # "#" cannot be stripped out since it is part of the URI
3011 # However, it can be safely stripped out if it follows whitespace
3012 return re.split(r'\s#', url, 1)[0].rstrip()
3013
3014 with contextlib.closing(batch_fd) as fd:
3015 return [url for url in map(fixup, fd) if url]
3016
3017
3018 def urlencode_postdata(*args, **kargs):
3019 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3020
3021
3022 def update_url_query(url, query):
3023 if not query:
3024 return url
3025 parsed_url = urllib.parse.urlparse(url)
3026 qs = urllib.parse.parse_qs(parsed_url.query)
3027 qs.update(query)
3028 return urllib.parse.urlunparse(parsed_url._replace(
3029 query=urllib.parse.urlencode(qs, True)))
3030
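# Illustrative usage (query values are given as lists, matching parse_qs output):
#   update_url_query('http://example.com/path', {'quality': ['HD'], 'format': ['mp4']})
#   # -> 'http://example.com/path?quality=HD&format=mp4'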
3031
3032 def update_Request(req, url=None, data=None, headers=None, query=None):
3033 req_headers = req.headers.copy()
3034 req_headers.update(headers or {})
3035 req_data = data or req.data
3036 req_url = update_url_query(url or req.get_full_url(), query)
3037 req_get_method = req.get_method()
3038 if req_get_method == 'HEAD':
3039 req_type = HEADRequest
3040 elif req_get_method == 'PUT':
3041 req_type = PUTRequest
3042 else:
3043 req_type = urllib.request.Request
3044 new_req = req_type(
3045 req_url, data=req_data, headers=req_headers,
3046 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3047 if hasattr(req, 'timeout'):
3048 new_req.timeout = req.timeout
3049 return new_req
3050
3051
3052 def _multipart_encode_impl(data, boundary):
3053 content_type = 'multipart/form-data; boundary=%s' % boundary
3054
3055 out = b''
3056 for k, v in data.items():
3057 out += b'--' + boundary.encode('ascii') + b'\r\n'
3058 if isinstance(k, str):
3059 k = k.encode()
3060 if isinstance(v, str):
3061 v = v.encode()
3062 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3063 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3064 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3065 if boundary.encode('ascii') in content:
3066 raise ValueError('Boundary overlaps with data')
3067 out += content
3068
3069 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3070
3071 return out, content_type
3072
3073
3074 def multipart_encode(data, boundary=None):
3075 '''
3076 Encode a dict to RFC 7578-compliant form-data
3077
3078 data:
3079 A dict where keys and values can be either Unicode or bytes-like
3080 objects.
3081 boundary:
3082 If specified, it must be a Unicode object and is used as the boundary;
3083 otherwise a random boundary is generated.
3084
3085 Reference: https://tools.ietf.org/html/rfc7578
3086 '''
3087 has_specified_boundary = boundary is not None
3088
3089 while True:
3090 if boundary is None:
3091 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3092
3093 try:
3094 out, content_type = _multipart_encode_impl(data, boundary)
3095 break
3096 except ValueError:
3097 if has_specified_boundary:
3098 raise
3099 boundary = None
3100
3101 return out, content_type
3102
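# Illustrative usage (the boundary is fixed here for a deterministic result):
#   body, ct = multipart_encode({'field': 'value'}, boundary='BOUNDARY')
#   ct    # -> 'multipart/form-data; boundary=BOUNDARY'
#   body  # -> b'--BOUNDARY\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--BOUNDARY--\r\n'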
3103
3104 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3105 for val in map(d.get, variadic(key_or_keys)):
3106 if val is not None and (val or not skip_false_values):
3107 return val
3108 return default
3109
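# Illustrative usage - falsy values are skipped by default:
#   dict_get({'a': '', 'b': 'x'}, ('a', 'b'))                           # -> 'x'
#   dict_get({'a': '', 'b': 'x'}, ('a', 'b'), skip_false_values=False)  # -> ''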
3110
3111 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3112 for f in funcs:
3113 try:
3114 val = f(*args, **kwargs)
3115 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3116 pass
3117 else:
3118 if expected_type is None or isinstance(val, expected_type):
3119 return val
3120
3121
3122 def try_get(src, getter, expected_type=None):
3123 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3124
3125
3126 def filter_dict(dct, cndn=lambda _, v: v is not None):
3127 return {k: v for k, v in dct.items() if cndn(k, v)}
3128
3129
3130 def merge_dicts(*dicts):
3131 merged = {}
3132 for a_dict in dicts:
3133 for k, v in a_dict.items():
3134 if (v is not None and k not in merged
3135 or isinstance(v, str) and merged[k] == ''):
3136 merged[k] = v
3137 return merged
3138
3139
3140 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3141 return string if isinstance(string, str) else str(string, encoding, errors)
3142
3143
3144 US_RATINGS = {
3145 'G': 0,
3146 'PG': 10,
3147 'PG-13': 13,
3148 'R': 16,
3149 'NC': 18,
3150 }
3151
3152
3153 TV_PARENTAL_GUIDELINES = {
3154 'TV-Y': 0,
3155 'TV-Y7': 7,
3156 'TV-G': 0,
3157 'TV-PG': 0,
3158 'TV-14': 14,
3159 'TV-MA': 17,
3160 }
3161
3162
3163 def parse_age_limit(s):
3164 # isinstance(False, int) is True. So type() must be used instead
3165 if type(s) is int: # noqa: E721
3166 return s if 0 <= s <= 21 else None
3167 elif not isinstance(s, str):
3168 return None
3169 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3170 if m:
3171 return int(m.group('age'))
3172 s = s.upper()
3173 if s in US_RATINGS:
3174 return US_RATINGS[s]
3175 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3176 if m:
3177 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3178 return None
3179
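# Illustrative usage:
#   parse_age_limit('PG-13')  # -> 13
#   parse_age_limit('TV-MA')  # -> 17
#   parse_age_limit('18+')    # -> 18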
3180
3181 def strip_jsonp(code):
3182 return re.sub(
3183 r'''(?sx)^
3184 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3185 (?:\s*&&\s*(?P=func_name))?
3186 \s*\(\s*(?P<callback_data>.*)\);?
3187 \s*?(?://[^\n]*)*$''',
3188 r'\g<callback_data>', code)
3189
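# Illustrative usage:
#   strip_jsonp('callback({"a": 1});')  # -> '{"a": 1}'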
3190
3191 def js_to_json(code, vars={}):
3192 # vars is a dict of var, val pairs to substitute
3193 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3194 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3195 INTEGER_TABLE = (
3196 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3197 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3198 )
3199
3200 def fix_kv(m):
3201 v = m.group(0)
3202 if v in ('true', 'false', 'null'):
3203 return v
3204 elif v in ('undefined', 'void 0'):
3205 return 'null'
3206 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3207 return ""
3208
3209 if v[0] in ("'", '"'):
3210 v = re.sub(r'(?s)\\.|"', lambda m: {
3211 '"': '\\"',
3212 "\\'": "'",
3213 '\\\n': '',
3214 '\\x': '\\u00',
3215 }.get(m.group(0), m.group(0)), v[1:-1])
3216 else:
3217 for regex, base in INTEGER_TABLE:
3218 im = re.match(regex, v)
3219 if im:
3220 i = int(im.group(1), base)
3221 return '"%d":' % i if v.endswith(':') else '%d' % i
3222
3223 if v in vars:
3224 return vars[v]
3225
3226 return '"%s"' % v
3227
3228 def create_map(mobj):
3229 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3230
3231 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3232 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3233
3234 return re.sub(r'''(?sx)
3235 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3236 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3237 {comment}|,(?={skip}[\]}}])|
3238 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3239 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3240 [0-9]+(?={skip}:)|
3241 !+
3242 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3243
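# Illustrative usage:
#   js_to_json("{'x': undefined}")  # -> '{"x": null}'
#   js_to_json('{"abc": "def",}')   # -> '{"abc": "def"}' (trailing comma dropped)
#   js_to_json('{a: 0x10}')         # -> '{"a": 16}' (hex literals converted)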
3244
3245 def qualities(quality_ids):
3246 """ Get a numeric quality value out of a list of possible values """
3247 def q(qid):
3248 try:
3249 return quality_ids.index(qid)
3250 except ValueError:
3251 return -1
3252 return q
3253
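# Illustrative usage - higher index means higher quality:
#   q = qualities(['240p', '720p', '1080p'])
#   q('720p')  # -> 1
#   q('4k')    # -> -1 (unknown quality)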
3254
3255 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3256
3257
3258 DEFAULT_OUTTMPL = {
3259 'default': '%(title)s [%(id)s].%(ext)s',
3260 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3261 }
3262 OUTTMPL_TYPES = {
3263 'chapter': None,
3264 'subtitle': None,
3265 'thumbnail': None,
3266 'description': 'description',
3267 'annotation': 'annotations.xml',
3268 'infojson': 'info.json',
3269 'link': None,
3270 'pl_video': None,
3271 'pl_thumbnail': None,
3272 'pl_description': 'description',
3273 'pl_infojson': 'info.json',
3274 }
3275
3276 # As of [1] format syntax is:
3277 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3278 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3279 STR_FORMAT_RE_TMPL = r'''(?x)
3280 (?<!%)(?P<prefix>(?:%%)*)
3281 %
3282 (?P<has_key>\((?P<key>{0})\))?
3283 (?P<format>
3284 (?P<conversion>[#0\-+ ]+)?
3285 (?P<min_width>\d+)?
3286 (?P<precision>\.\d+)?
3287 (?P<len_mod>[hlL])? # unused in python
3288 {1} # conversion type
3289 )
3290 '''
3291
3292
3293 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3294
3295
3296 def limit_length(s, length):
3297 """ Add ellipses to overly long strings """
3298 if s is None:
3299 return None
3300 ELLIPSES = '...'
3301 if len(s) > length:
3302 return s[:length - len(ELLIPSES)] + ELLIPSES
3303 return s
3304
3305
3306 def version_tuple(v):
3307 return tuple(int(e) for e in re.split(r'[-.]', v))
3308
3309
3310 def is_outdated_version(version, limit, assume_new=True):
3311 if not version:
3312 return not assume_new
3313 try:
3314 return version_tuple(version) < version_tuple(limit)
3315 except ValueError:
3316 return not assume_new
3317
3318
3319 def ytdl_is_updateable():
3320 """ Returns if yt-dlp can be updated with -U """
3321
3322 from .update import is_non_updateable
3323
3324 return not is_non_updateable()
3325
3326
3327 def args_to_str(args):
3328 # Get a short string representation for a subprocess command
3329 return ' '.join(compat_shlex_quote(a) for a in args)
3330
3331
3332 def error_to_compat_str(err):
3333 return str(err)
3334
3335
3336 def error_to_str(err):
3337 return f'{type(err).__name__}: {err}'
3338
3339
3340 def mimetype2ext(mt):
3341 if mt is None:
3342 return None
3343
3344 mt, _, params = mt.partition(';')
3345 mt = mt.strip()
3346
3347 FULL_MAP = {
3348 'audio/mp4': 'm4a',
3349 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
3350 # since it is by far the most common
3351 'audio/mpeg': 'mp3',
3352 'audio/x-wav': 'wav',
3353 'audio/wav': 'wav',
3354 'audio/wave': 'wav',
3355 }
3356
3357 ext = FULL_MAP.get(mt)
3358 if ext is not None:
3359 return ext
3360
3361 SUBTYPE_MAP = {
3362 '3gpp': '3gp',
3363 'smptett+xml': 'tt',
3364 'ttaf+xml': 'dfxp',
3365 'ttml+xml': 'ttml',
3366 'x-flv': 'flv',
3367 'x-mp4-fragmented': 'mp4',
3368 'x-ms-sami': 'sami',
3369 'x-ms-wmv': 'wmv',
3370 'mpegurl': 'm3u8',
3371 'x-mpegurl': 'm3u8',
3372 'vnd.apple.mpegurl': 'm3u8',
3373 'dash+xml': 'mpd',
3374 'f4m+xml': 'f4m',
3375 'hds+xml': 'f4m',
3376 'vnd.ms-sstr+xml': 'ism',
3377 'quicktime': 'mov',
3378 'mp2t': 'ts',
3379 'x-wav': 'wav',
3380 'filmstrip+json': 'fs',
3381 'svg+xml': 'svg',
3382 }
3383
3384 _, _, subtype = mt.rpartition('/')
3385 ext = SUBTYPE_MAP.get(subtype.lower())
3386 if ext is not None:
3387 return ext
3388
3389 SUFFIX_MAP = {
3390 'json': 'json',
3391 'xml': 'xml',
3392 'zip': 'zip',
3393 'gzip': 'gz',
3394 }
3395
3396 _, _, suffix = subtype.partition('+')
3397 ext = SUFFIX_MAP.get(suffix)
3398 if ext is not None:
3399 return ext
3400
3401 return subtype.replace('+', '.')
3402
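# Illustrative usage:
#   mimetype2ext('audio/mp4')                # -> 'm4a'
#   mimetype2ext('application/x-mpegURL')    # -> 'm3u8'
#   mimetype2ext('text/vtt; charset=UTF-8')  # -> 'vtt' (parameters are ignored)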
3403
3404 def ext2mimetype(ext_or_url):
3405 if not ext_or_url:
3406 return None
3407 if '.' not in ext_or_url:
3408 ext_or_url = f'file.{ext_or_url}'
3409 return mimetypes.guess_type(ext_or_url)[0]
3410
3411
3412 def parse_codecs(codecs_str):
3413 # http://tools.ietf.org/html/rfc6381
3414 if not codecs_str:
3415 return {}
3416 split_codecs = list(filter(None, map(
3417 str.strip, codecs_str.strip().strip(',').split(','))))
3418 vcodec, acodec, scodec, hdr = None, None, None, None
3419 for full_codec in split_codecs:
3420 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3421 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3422 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3423 if vcodec:
3424 continue
3425 vcodec = full_codec
3426 if parts[0] in ('dvh1', 'dvhe'):
3427 hdr = 'DV'
3428 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3429 hdr = 'HDR10'
3430 elif parts[:2] == ['vp9', '2']:
3431 hdr = 'HDR10'
3432 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3433 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3434 acodec = acodec or full_codec
3435 elif parts[0] in ('stpp', 'wvtt'):
3436 scodec = scodec or full_codec
3437 else:
3438 write_string(f'WARNING: Unknown codec {full_codec}\n')
3439 if vcodec or acodec or scodec:
3440 return {
3441 'vcodec': vcodec or 'none',
3442 'acodec': acodec or 'none',
3443 'dynamic_range': hdr,
3444 **({'scodec': scodec} if scodec is not None else {}),
3445 }
3446 elif len(split_codecs) == 2:
3447 return {
3448 'vcodec': split_codecs[0],
3449 'acodec': split_codecs[1],
3450 }
3451 return {}
3452
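# Illustrative usage:
#   parse_codecs('avc1.77.30, mp4a.40.2')
#   # -> {'vcodec': 'avc1.77.30', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
#   parse_codecs('dvhe.05.06')
#   # -> {'vcodec': 'dvhe.05.06', 'acodec': 'none', 'dynamic_range': 'DV'}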
3453
3454 def urlhandle_detect_ext(url_handle):
3455 getheader = url_handle.headers.get
3456
3457 cd = getheader('Content-Disposition')
3458 if cd:
3459 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3460 if m:
3461 e = determine_ext(m.group('filename'), default_ext=None)
3462 if e:
3463 return e
3464
3465 return mimetype2ext(getheader('Content-Type'))
3466
3467
3468 def encode_data_uri(data, mime_type):
3469 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3470
3471
3472 def age_restricted(content_limit, age_limit):
3473 """ Returns True iff the content should be blocked """
3474
3475 if age_limit is None: # No limit set
3476 return False
3477 if content_limit is None:
3478 return False # Content available for everyone
3479 return age_limit < content_limit
3480
3481
3482 # List of known byte-order-marks (BOM)
3483 BOMS = [
3484 (b'\xef\xbb\xbf', 'utf-8'),
3485 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3486 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3487 (b'\xff\xfe', 'utf-16-le'),
3488 (b'\xfe\xff', 'utf-16-be'),
3489 ]
3490
3491
3492 def is_html(first_bytes):
3493 """ Detect whether a file contains HTML by examining its first bytes. """
3494
3495 encoding = 'utf-8'
3496 for bom, enc in BOMS:
3497 while first_bytes.startswith(bom):
3498 encoding, first_bytes = enc, first_bytes[len(bom):]
3499
3500 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3501
3502
3503 def determine_protocol(info_dict):
3504 protocol = info_dict.get('protocol')
3505 if protocol is not None:
3506 return protocol
3507
3508 url = sanitize_url(info_dict['url'])
3509 if url.startswith('rtmp'):
3510 return 'rtmp'
3511 elif url.startswith('mms'):
3512 return 'mms'
3513 elif url.startswith('rtsp'):
3514 return 'rtsp'
3515
3516 ext = determine_ext(url)
3517 if ext == 'm3u8':
3518 return 'm3u8'
3519 elif ext == 'f4m':
3520 return 'f4m'
3521
3522 return urllib.parse.urlparse(url).scheme
3523
3524
3525 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3526 """ Render a list of rows, each as a list of values.
3527 Text after a \t will be right-aligned """
3528 def width(string):
3529 return len(remove_terminal_sequences(string).replace('\t', ''))
3530
3531 def get_max_lens(table):
3532 return [max(width(str(v)) for v in col) for col in zip(*table)]
3533
3534 def filter_using_list(row, filterArray):
3535 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3536
3537 max_lens = get_max_lens(data) if hide_empty else []
3538 header_row = filter_using_list(header_row, max_lens)
3539 data = [filter_using_list(row, max_lens) for row in data]
3540
3541 table = [header_row] + data
3542 max_lens = get_max_lens(table)
3543 extra_gap += 1
3544 if delim:
3545 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3546 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3547 for row in table:
3548 for pos, text in enumerate(map(str, row)):
3549 if '\t' in text:
3550 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3551 else:
3552 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3553 ret = '\n'.join(''.join(row).rstrip() for row in table)
3554 return ret
3555
3556
3557 def _match_one(filter_part, dct, incomplete):
3558 # TODO: Generalize code with YoutubeDL._build_format_filter
3559 STRING_OPERATORS = {
3560 '*=': operator.contains,
3561 '^=': lambda attr, value: attr.startswith(value),
3562 '$=': lambda attr, value: attr.endswith(value),
3563 '~=': lambda attr, value: re.search(value, attr),
3564 }
3565 COMPARISON_OPERATORS = {
3566 **STRING_OPERATORS,
3567 '<=': operator.le, # "<=" must be defined above "<"
3568 '<': operator.lt,
3569 '>=': operator.ge,
3570 '>': operator.gt,
3571 '=': operator.eq,
3572 }
3573
3574 if isinstance(incomplete, bool):
3575 is_incomplete = lambda _: incomplete
3576 else:
3577 is_incomplete = lambda k: k in incomplete
3578
3579 operator_rex = re.compile(r'''(?x)
3580 (?P<key>[a-z_]+)
3581 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3582 (?:
3583 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3584 (?P<strval>.+?)
3585 )
3586 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3587 m = operator_rex.fullmatch(filter_part.strip())
3588 if m:
3589 m = m.groupdict()
3590 unnegated_op = COMPARISON_OPERATORS[m['op']]
3591 if m['negation']:
3592 op = lambda attr, value: not unnegated_op(attr, value)
3593 else:
3594 op = unnegated_op
3595 comparison_value = m['quotedstrval'] or m['strval']  # the regex defines no 'intval' group
3596 if m['quote']:
3597 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3598 actual_value = dct.get(m['key'])
3599 numeric_comparison = None
3600 if isinstance(actual_value, (int, float)):
3601 # If the original field is a string and the matching comparison value is
3602 # a number, we should respect the origin of the original field
3603 # and process the comparison value as a string (see
3604 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3605 try:
3606 numeric_comparison = int(comparison_value)
3607 except ValueError:
3608 numeric_comparison = parse_filesize(comparison_value)
3609 if numeric_comparison is None:
3610 numeric_comparison = parse_filesize(f'{comparison_value}B')
3611 if numeric_comparison is None:
3612 numeric_comparison = parse_duration(comparison_value)
3613 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3614 raise ValueError('Operator %s only supports string values!' % m['op'])
3615 if actual_value is None:
3616 return is_incomplete(m['key']) or m['none_inclusive']
3617 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3618
3619 UNARY_OPERATORS = {
3620 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3621 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3622 }
3623 operator_rex = re.compile(r'''(?x)
3624 (?P<op>%s)\s*(?P<key>[a-z_]+)
3625 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3626 m = operator_rex.fullmatch(filter_part.strip())
3627 if m:
3628 op = UNARY_OPERATORS[m.group('op')]
3629 actual_value = dct.get(m.group('key'))
3630 if is_incomplete(m.group('key')) and actual_value is None:
3631 return True
3632 return op(actual_value)
3633
3634 raise ValueError('Invalid filter part %r' % filter_part)
3635
3636
3637 def match_str(filter_str, dct, incomplete=False):
3638 """ Filter a dictionary with a simple string syntax.
3639 @returns Whether the filter passes
3640 @param incomplete Set of keys that are expected to be missing from dct.
3641 Can be True/False to indicate all/none of the keys may be missing.
3642 All conditions on incomplete keys pass if the key is missing.
3643 """
3644 return all(
3645 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3646 for filter_part in re.split(r'(?<!\\)&', filter_str))
3647
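# Editor's examples of the filter syntax handled by _match_one:
#   >>> match_str('duration > 60 & title *= cat', {'duration': 90, 'title': 'my cat video'})
#   True
#   >>> match_str('duration < 60', {'duration': 90})
#   False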
3648
3649 def match_filter_func(filters):
3650 if not filters:
3651 return None
3652 filters = set(variadic(filters))
3653
3654 interactive = '-' in filters
3655 if interactive:
3656 filters.remove('-')
3657
3658 def _match_func(info_dict, incomplete=False):
3659 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3660 return NO_DEFAULT if interactive and not incomplete else None
3661 else:
3662 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3663 filter_str = ') | ('.join(map(str.strip, filters))
3664 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3665 return _match_func
3666
3667
3668 class download_range_func:
3669 def __init__(self, chapters, ranges):
3670 self.chapters, self.ranges = chapters, ranges
3671
3672 def __call__(self, info_dict, ydl):
3673 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3674 else 'Cannot match chapters since chapter information is unavailable')
3675 for regex in self.chapters or []:
3676 for i, chapter in enumerate(info_dict.get('chapters') or []):
3677 if re.search(regex, chapter['title']):
3678 warning = None
3679 yield {**chapter, 'index': i}
3680 if self.chapters and warning:
3681 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3682
3683 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3684
3685 def __eq__(self, other):
3686 return (isinstance(other, download_range_func)
3687 and self.chapters == other.chapters and self.ranges == other.ranges)
3688
3689
3690 def parse_dfxp_time_expr(time_expr):
3691 if not time_expr:
3692 return
3693
3694 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3695 if mobj:
3696 return float(mobj.group('time_offset'))
3697
3698 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3699 if mobj:
3700 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3701
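# Illustrative values (editor's addition):
#   >>> parse_dfxp_time_expr('12.5s')
#   12.5
#   >>> parse_dfxp_time_expr('00:01:30.5')
#   90.5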
3702
3703 def srt_subtitles_timecode(seconds):
3704 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3705
3706
3707 def ass_subtitles_timecode(seconds):
3708 time = timetuple_from_msec(seconds * 1000)
3709 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3710
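# Editor's examples for the two timecode helpers above:
#   >>> srt_subtitles_timecode(3.5)
#   '00:00:03,500'
#   >>> ass_subtitles_timecode(3.5)
#   '0:00:03.50'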
3711
3712 def dfxp2srt(dfxp_data):
3713 '''
3714 @param dfxp_data A bytes-like object containing DFXP data
3715 @returns A unicode object containing converted SRT data
3716 '''
3717 LEGACY_NAMESPACES = (
3718 (b'http://www.w3.org/ns/ttml', [
3719 b'http://www.w3.org/2004/11/ttaf1',
3720 b'http://www.w3.org/2006/04/ttaf1',
3721 b'http://www.w3.org/2006/10/ttaf1',
3722 ]),
3723 (b'http://www.w3.org/ns/ttml#styling', [
3724 b'http://www.w3.org/ns/ttml#style',
3725 ]),
3726 )
3727
3728 SUPPORTED_STYLING = [
3729 'color',
3730 'fontFamily',
3731 'fontSize',
3732 'fontStyle',
3733 'fontWeight',
3734 'textDecoration'
3735 ]
3736
3737 _x = functools.partial(xpath_with_ns, ns_map={
3738 'xml': 'http://www.w3.org/XML/1998/namespace',
3739 'ttml': 'http://www.w3.org/ns/ttml',
3740 'tts': 'http://www.w3.org/ns/ttml#styling',
3741 })
3742
3743 styles = {}
3744 default_style = {}
3745
3746 class TTMLPElementParser:
3747 _out = ''
3748 _unclosed_elements = []
3749 _applied_styles = []
3750
3751 def start(self, tag, attrib):
3752 if tag in (_x('ttml:br'), 'br'):
3753 self._out += '\n'
3754 else:
3755 unclosed_elements = []
3756 style = {}
3757 element_style_id = attrib.get('style')
3758 if default_style:
3759 style.update(default_style)
3760 if element_style_id:
3761 style.update(styles.get(element_style_id, {}))
3762 for prop in SUPPORTED_STYLING:
3763 prop_val = attrib.get(_x('tts:' + prop))
3764 if prop_val:
3765 style[prop] = prop_val
3766 if style:
3767 font = ''
3768 for k, v in sorted(style.items()):
3769 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3770 continue
3771 if k == 'color':
3772 font += ' color="%s"' % v
3773 elif k == 'fontSize':
3774 font += ' size="%s"' % v
3775 elif k == 'fontFamily':
3776 font += ' face="%s"' % v
3777 elif k == 'fontWeight' and v == 'bold':
3778 self._out += '<b>'
3779 unclosed_elements.append('b')
3780 elif k == 'fontStyle' and v == 'italic':
3781 self._out += '<i>'
3782 unclosed_elements.append('i')
3783 elif k == 'textDecoration' and v == 'underline':
3784 self._out += '<u>'
3785 unclosed_elements.append('u')
3786 if font:
3787 self._out += '<font' + font + '>'
3788 unclosed_elements.append('font')
3789 applied_style = {}
3790 if self._applied_styles:
3791 applied_style.update(self._applied_styles[-1])
3792 applied_style.update(style)
3793 self._applied_styles.append(applied_style)
3794 self._unclosed_elements.append(unclosed_elements)
3795
3796 def end(self, tag):
3797 if tag not in (_x('ttml:br'), 'br'):
3798 unclosed_elements = self._unclosed_elements.pop()
3799 for element in reversed(unclosed_elements):
3800 self._out += '</%s>' % element
3801 if unclosed_elements and self._applied_styles:
3802 self._applied_styles.pop()
3803
3804 def data(self, data):
3805 self._out += data
3806
3807 def close(self):
3808 return self._out.strip()
3809
3810 def parse_node(node):
3811 target = TTMLPElementParser()
3812 parser = xml.etree.ElementTree.XMLParser(target=target)
3813 parser.feed(xml.etree.ElementTree.tostring(node))
3814 return parser.close()
3815
3816 for k, v in LEGACY_NAMESPACES:
3817 for ns in v:
3818 dfxp_data = dfxp_data.replace(ns, k)
3819
3820 dfxp = compat_etree_fromstring(dfxp_data)
3821 out = []
3822 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3823
3824 if not paras:
3825 raise ValueError('Invalid dfxp/TTML subtitle')
3826
3827 repeat = False
3828 while True:
3829 for style in dfxp.findall(_x('.//ttml:style')):
3830 style_id = style.get('id') or style.get(_x('xml:id'))
3831 if not style_id:
3832 continue
3833 parent_style_id = style.get('style')
3834 if parent_style_id:
3835 if parent_style_id not in styles:
3836 repeat = True
3837 continue
3838 styles[style_id] = styles[parent_style_id].copy()
3839 for prop in SUPPORTED_STYLING:
3840 prop_val = style.get(_x('tts:' + prop))
3841 if prop_val:
3842 styles.setdefault(style_id, {})[prop] = prop_val
3843 if repeat:
3844 repeat = False
3845 else:
3846 break
3847
3848 for p in ('body', 'div'):
3849 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3850 if ele is None:
3851 continue
3852 style = styles.get(ele.get('style'))
3853 if not style:
3854 continue
3855 default_style.update(style)
3856
3857 for para, index in zip(paras, itertools.count(1)):
3858 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3859 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3860 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3861 if begin_time is None:
3862 continue
3863 if not end_time:
3864 if not dur:
3865 continue
3866 end_time = begin_time + dur
3867 out.append('%d\n%s --> %s\n%s\n\n' % (
3868 index,
3869 srt_subtitles_timecode(begin_time),
3870 srt_subtitles_timecode(end_time),
3871 parse_node(para)))
3872
3873 return ''.join(out)
3874
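# A minimal round trip (editor's sketch; the TTML snippet is made up):
#   >>> dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#   ...          b'<p begin="0s" end="1.5s">Hello</p></div></body></tt>')
#   '1\n00:00:00,000 --> 00:00:01,500\nHello\n\n'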
3875
3876 def cli_option(params, command_option, param, separator=None):
3877 param = params.get(param)
3878 return ([] if param is None
3879 else [command_option, str(param)] if separator is None
3880 else [f'{command_option}{separator}{param}'])
3881
3882
3883 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3884 param = params.get(param)
3885 assert param in (True, False, None)
3886 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3887
3888
3889 def cli_valueless_option(params, command_option, param, expected_value=True):
3890 return [command_option] if params.get(param) == expected_value else []
3891
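# Editor's examples for the three cli_* helpers above:
#   >>> cli_option({'proxy': 'http://example.com'}, '--proxy', 'proxy')
#   ['--proxy', 'http://example.com']
#   >>> cli_bool_option({'nocheck': False}, '--check', 'nocheck', separator='=')
#   ['--check=false']
#   >>> cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#   ['--quiet']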
3892
3893 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3894 if isinstance(argdict, (list, tuple)): # for backward compatibility
3895 if use_compat:
3896 return argdict
3897 else:
3898 argdict = None
3899 if argdict is None:
3900 return default
3901 assert isinstance(argdict, dict)
3902
3903 assert isinstance(keys, (list, tuple))
3904 for key_list in keys:
3905 arg_list = list(filter(
3906 lambda x: x is not None,
3907 [argdict.get(key.lower()) for key in variadic(key_list)]))
3908 if arg_list:
3909 return [arg for args in arg_list for arg in args]
3910 return default
3911
3912
3913 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3914 main_key, exe = main_key.lower(), exe.lower()
3915 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3916 keys = [f'{root_key}{k}' for k in (keys or [''])]
3917 if root_key in keys:
3918 if main_key != exe:
3919 keys.append((main_key, exe))
3920 keys.append('default')
3921 else:
3922 use_compat = False
3923 return cli_configuration_args(argdict, keys, default, use_compat)
3924
3925
3926 class ISO639Utils:
3927 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3928 _lang_map = {
3929 'aa': 'aar',
3930 'ab': 'abk',
3931 'ae': 'ave',
3932 'af': 'afr',
3933 'ak': 'aka',
3934 'am': 'amh',
3935 'an': 'arg',
3936 'ar': 'ara',
3937 'as': 'asm',
3938 'av': 'ava',
3939 'ay': 'aym',
3940 'az': 'aze',
3941 'ba': 'bak',
3942 'be': 'bel',
3943 'bg': 'bul',
3944 'bh': 'bih',
3945 'bi': 'bis',
3946 'bm': 'bam',
3947 'bn': 'ben',
3948 'bo': 'bod',
3949 'br': 'bre',
3950 'bs': 'bos',
3951 'ca': 'cat',
3952 'ce': 'che',
3953 'ch': 'cha',
3954 'co': 'cos',
3955 'cr': 'cre',
3956 'cs': 'ces',
3957 'cu': 'chu',
3958 'cv': 'chv',
3959 'cy': 'cym',
3960 'da': 'dan',
3961 'de': 'deu',
3962 'dv': 'div',
3963 'dz': 'dzo',
3964 'ee': 'ewe',
3965 'el': 'ell',
3966 'en': 'eng',
3967 'eo': 'epo',
3968 'es': 'spa',
3969 'et': 'est',
3970 'eu': 'eus',
3971 'fa': 'fas',
3972 'ff': 'ful',
3973 'fi': 'fin',
3974 'fj': 'fij',
3975 'fo': 'fao',
3976 'fr': 'fra',
3977 'fy': 'fry',
3978 'ga': 'gle',
3979 'gd': 'gla',
3980 'gl': 'glg',
3981 'gn': 'grn',
3982 'gu': 'guj',
3983 'gv': 'glv',
3984 'ha': 'hau',
3985 'he': 'heb',
3986 'iw': 'heb', # Replaced by he in 1989 revision
3987 'hi': 'hin',
3988 'ho': 'hmo',
3989 'hr': 'hrv',
3990 'ht': 'hat',
3991 'hu': 'hun',
3992 'hy': 'hye',
3993 'hz': 'her',
3994 'ia': 'ina',
3995 'id': 'ind',
3996 'in': 'ind', # Replaced by id in 1989 revision
3997 'ie': 'ile',
3998 'ig': 'ibo',
3999 'ii': 'iii',
4000 'ik': 'ipk',
4001 'io': 'ido',
4002 'is': 'isl',
4003 'it': 'ita',
4004 'iu': 'iku',
4005 'ja': 'jpn',
4006 'jv': 'jav',
4007 'ka': 'kat',
4008 'kg': 'kon',
4009 'ki': 'kik',
4010 'kj': 'kua',
4011 'kk': 'kaz',
4012 'kl': 'kal',
4013 'km': 'khm',
4014 'kn': 'kan',
4015 'ko': 'kor',
4016 'kr': 'kau',
4017 'ks': 'kas',
4018 'ku': 'kur',
4019 'kv': 'kom',
4020 'kw': 'cor',
4021 'ky': 'kir',
4022 'la': 'lat',
4023 'lb': 'ltz',
4024 'lg': 'lug',
4025 'li': 'lim',
4026 'ln': 'lin',
4027 'lo': 'lao',
4028 'lt': 'lit',
4029 'lu': 'lub',
4030 'lv': 'lav',
4031 'mg': 'mlg',
4032 'mh': 'mah',
4033 'mi': 'mri',
4034 'mk': 'mkd',
4035 'ml': 'mal',
4036 'mn': 'mon',
4037 'mr': 'mar',
4038 'ms': 'msa',
4039 'mt': 'mlt',
4040 'my': 'mya',
4041 'na': 'nau',
4042 'nb': 'nob',
4043 'nd': 'nde',
4044 'ne': 'nep',
4045 'ng': 'ndo',
4046 'nl': 'nld',
4047 'nn': 'nno',
4048 'no': 'nor',
4049 'nr': 'nbl',
4050 'nv': 'nav',
4051 'ny': 'nya',
4052 'oc': 'oci',
4053 'oj': 'oji',
4054 'om': 'orm',
4055 'or': 'ori',
4056 'os': 'oss',
4057 'pa': 'pan',
4058 'pi': 'pli',
4059 'pl': 'pol',
4060 'ps': 'pus',
4061 'pt': 'por',
4062 'qu': 'que',
4063 'rm': 'roh',
4064 'rn': 'run',
4065 'ro': 'ron',
4066 'ru': 'rus',
4067 'rw': 'kin',
4068 'sa': 'san',
4069 'sc': 'srd',
4070 'sd': 'snd',
4071 'se': 'sme',
4072 'sg': 'sag',
4073 'si': 'sin',
4074 'sk': 'slk',
4075 'sl': 'slv',
4076 'sm': 'smo',
4077 'sn': 'sna',
4078 'so': 'som',
4079 'sq': 'sqi',
4080 'sr': 'srp',
4081 'ss': 'ssw',
4082 'st': 'sot',
4083 'su': 'sun',
4084 'sv': 'swe',
4085 'sw': 'swa',
4086 'ta': 'tam',
4087 'te': 'tel',
4088 'tg': 'tgk',
4089 'th': 'tha',
4090 'ti': 'tir',
4091 'tk': 'tuk',
4092 'tl': 'tgl',
4093 'tn': 'tsn',
4094 'to': 'ton',
4095 'tr': 'tur',
4096 'ts': 'tso',
4097 'tt': 'tat',
4098 'tw': 'twi',
4099 'ty': 'tah',
4100 'ug': 'uig',
4101 'uk': 'ukr',
4102 'ur': 'urd',
4103 'uz': 'uzb',
4104 've': 'ven',
4105 'vi': 'vie',
4106 'vo': 'vol',
4107 'wa': 'wln',
4108 'wo': 'wol',
4109 'xh': 'xho',
4110 'yi': 'yid',
4111 'ji': 'yid', # Replaced by yi in 1989 revision
4112 'yo': 'yor',
4113 'za': 'zha',
4114 'zh': 'zho',
4115 'zu': 'zul',
4116 }
4117
4118 @classmethod
4119 def short2long(cls, code):
4120 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4121 return cls._lang_map.get(code[:2])
4122
4123 @classmethod
4124 def long2short(cls, code):
4125 """Convert language code from ISO 639-2/T to ISO 639-1"""
4126 for short_name, long_name in cls._lang_map.items():
4127 if long_name == code:
4128 return short_name
4129
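# Illustrative conversions (editor's addition):
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'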
4130
4131 class ISO3166Utils:
4132 # From http://data.okfn.org/data/core/country-list
4133 _country_map = {
4134 'AF': 'Afghanistan',
4135 'AX': 'Åland Islands',
4136 'AL': 'Albania',
4137 'DZ': 'Algeria',
4138 'AS': 'American Samoa',
4139 'AD': 'Andorra',
4140 'AO': 'Angola',
4141 'AI': 'Anguilla',
4142 'AQ': 'Antarctica',
4143 'AG': 'Antigua and Barbuda',
4144 'AR': 'Argentina',
4145 'AM': 'Armenia',
4146 'AW': 'Aruba',
4147 'AU': 'Australia',
4148 'AT': 'Austria',
4149 'AZ': 'Azerbaijan',
4150 'BS': 'Bahamas',
4151 'BH': 'Bahrain',
4152 'BD': 'Bangladesh',
4153 'BB': 'Barbados',
4154 'BY': 'Belarus',
4155 'BE': 'Belgium',
4156 'BZ': 'Belize',
4157 'BJ': 'Benin',
4158 'BM': 'Bermuda',
4159 'BT': 'Bhutan',
4160 'BO': 'Bolivia, Plurinational State of',
4161 'BQ': 'Bonaire, Sint Eustatius and Saba',
4162 'BA': 'Bosnia and Herzegovina',
4163 'BW': 'Botswana',
4164 'BV': 'Bouvet Island',
4165 'BR': 'Brazil',
4166 'IO': 'British Indian Ocean Territory',
4167 'BN': 'Brunei Darussalam',
4168 'BG': 'Bulgaria',
4169 'BF': 'Burkina Faso',
4170 'BI': 'Burundi',
4171 'KH': 'Cambodia',
4172 'CM': 'Cameroon',
4173 'CA': 'Canada',
4174 'CV': 'Cape Verde',
4175 'KY': 'Cayman Islands',
4176 'CF': 'Central African Republic',
4177 'TD': 'Chad',
4178 'CL': 'Chile',
4179 'CN': 'China',
4180 'CX': 'Christmas Island',
4181 'CC': 'Cocos (Keeling) Islands',
4182 'CO': 'Colombia',
4183 'KM': 'Comoros',
4184 'CG': 'Congo',
4185 'CD': 'Congo, the Democratic Republic of the',
4186 'CK': 'Cook Islands',
4187 'CR': 'Costa Rica',
4188 'CI': 'Côte d\'Ivoire',
4189 'HR': 'Croatia',
4190 'CU': 'Cuba',
4191 'CW': 'Curaçao',
4192 'CY': 'Cyprus',
4193 'CZ': 'Czech Republic',
4194 'DK': 'Denmark',
4195 'DJ': 'Djibouti',
4196 'DM': 'Dominica',
4197 'DO': 'Dominican Republic',
4198 'EC': 'Ecuador',
4199 'EG': 'Egypt',
4200 'SV': 'El Salvador',
4201 'GQ': 'Equatorial Guinea',
4202 'ER': 'Eritrea',
4203 'EE': 'Estonia',
4204 'ET': 'Ethiopia',
4205 'FK': 'Falkland Islands (Malvinas)',
4206 'FO': 'Faroe Islands',
4207 'FJ': 'Fiji',
4208 'FI': 'Finland',
4209 'FR': 'France',
4210 'GF': 'French Guiana',
4211 'PF': 'French Polynesia',
4212 'TF': 'French Southern Territories',
4213 'GA': 'Gabon',
4214 'GM': 'Gambia',
4215 'GE': 'Georgia',
4216 'DE': 'Germany',
4217 'GH': 'Ghana',
4218 'GI': 'Gibraltar',
4219 'GR': 'Greece',
4220 'GL': 'Greenland',
4221 'GD': 'Grenada',
4222 'GP': 'Guadeloupe',
4223 'GU': 'Guam',
4224 'GT': 'Guatemala',
4225 'GG': 'Guernsey',
4226 'GN': 'Guinea',
4227 'GW': 'Guinea-Bissau',
4228 'GY': 'Guyana',
4229 'HT': 'Haiti',
4230 'HM': 'Heard Island and McDonald Islands',
4231 'VA': 'Holy See (Vatican City State)',
4232 'HN': 'Honduras',
4233 'HK': 'Hong Kong',
4234 'HU': 'Hungary',
4235 'IS': 'Iceland',
4236 'IN': 'India',
4237 'ID': 'Indonesia',
4238 'IR': 'Iran, Islamic Republic of',
4239 'IQ': 'Iraq',
4240 'IE': 'Ireland',
4241 'IM': 'Isle of Man',
4242 'IL': 'Israel',
4243 'IT': 'Italy',
4244 'JM': 'Jamaica',
4245 'JP': 'Japan',
4246 'JE': 'Jersey',
4247 'JO': 'Jordan',
4248 'KZ': 'Kazakhstan',
4249 'KE': 'Kenya',
4250 'KI': 'Kiribati',
4251 'KP': 'Korea, Democratic People\'s Republic of',
4252 'KR': 'Korea, Republic of',
4253 'KW': 'Kuwait',
4254 'KG': 'Kyrgyzstan',
4255 'LA': 'Lao People\'s Democratic Republic',
4256 'LV': 'Latvia',
4257 'LB': 'Lebanon',
4258 'LS': 'Lesotho',
4259 'LR': 'Liberia',
4260 'LY': 'Libya',
4261 'LI': 'Liechtenstein',
4262 'LT': 'Lithuania',
4263 'LU': 'Luxembourg',
4264 'MO': 'Macao',
4265 'MK': 'Macedonia, the Former Yugoslav Republic of',
4266 'MG': 'Madagascar',
4267 'MW': 'Malawi',
4268 'MY': 'Malaysia',
4269 'MV': 'Maldives',
4270 'ML': 'Mali',
4271 'MT': 'Malta',
4272 'MH': 'Marshall Islands',
4273 'MQ': 'Martinique',
4274 'MR': 'Mauritania',
4275 'MU': 'Mauritius',
4276 'YT': 'Mayotte',
4277 'MX': 'Mexico',
4278 'FM': 'Micronesia, Federated States of',
4279 'MD': 'Moldova, Republic of',
4280 'MC': 'Monaco',
4281 'MN': 'Mongolia',
4282 'ME': 'Montenegro',
4283 'MS': 'Montserrat',
4284 'MA': 'Morocco',
4285 'MZ': 'Mozambique',
4286 'MM': 'Myanmar',
4287 'NA': 'Namibia',
4288 'NR': 'Nauru',
4289 'NP': 'Nepal',
4290 'NL': 'Netherlands',
4291 'NC': 'New Caledonia',
4292 'NZ': 'New Zealand',
4293 'NI': 'Nicaragua',
4294 'NE': 'Niger',
4295 'NG': 'Nigeria',
4296 'NU': 'Niue',
4297 'NF': 'Norfolk Island',
4298 'MP': 'Northern Mariana Islands',
4299 'NO': 'Norway',
4300 'OM': 'Oman',
4301 'PK': 'Pakistan',
4302 'PW': 'Palau',
4303 'PS': 'Palestine, State of',
4304 'PA': 'Panama',
4305 'PG': 'Papua New Guinea',
4306 'PY': 'Paraguay',
4307 'PE': 'Peru',
4308 'PH': 'Philippines',
4309 'PN': 'Pitcairn',
4310 'PL': 'Poland',
4311 'PT': 'Portugal',
4312 'PR': 'Puerto Rico',
4313 'QA': 'Qatar',
4314 'RE': 'Réunion',
4315 'RO': 'Romania',
4316 'RU': 'Russian Federation',
4317 'RW': 'Rwanda',
4318 'BL': 'Saint Barthélemy',
4319 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4320 'KN': 'Saint Kitts and Nevis',
4321 'LC': 'Saint Lucia',
4322 'MF': 'Saint Martin (French part)',
4323 'PM': 'Saint Pierre and Miquelon',
4324 'VC': 'Saint Vincent and the Grenadines',
4325 'WS': 'Samoa',
4326 'SM': 'San Marino',
4327 'ST': 'Sao Tome and Principe',
4328 'SA': 'Saudi Arabia',
4329 'SN': 'Senegal',
4330 'RS': 'Serbia',
4331 'SC': 'Seychelles',
4332 'SL': 'Sierra Leone',
4333 'SG': 'Singapore',
4334 'SX': 'Sint Maarten (Dutch part)',
4335 'SK': 'Slovakia',
4336 'SI': 'Slovenia',
4337 'SB': 'Solomon Islands',
4338 'SO': 'Somalia',
4339 'ZA': 'South Africa',
4340 'GS': 'South Georgia and the South Sandwich Islands',
4341 'SS': 'South Sudan',
4342 'ES': 'Spain',
4343 'LK': 'Sri Lanka',
4344 'SD': 'Sudan',
4345 'SR': 'Suriname',
4346 'SJ': 'Svalbard and Jan Mayen',
4347 'SZ': 'Swaziland',
4348 'SE': 'Sweden',
4349 'CH': 'Switzerland',
4350 'SY': 'Syrian Arab Republic',
4351 'TW': 'Taiwan, Province of China',
4352 'TJ': 'Tajikistan',
4353 'TZ': 'Tanzania, United Republic of',
4354 'TH': 'Thailand',
4355 'TL': 'Timor-Leste',
4356 'TG': 'Togo',
4357 'TK': 'Tokelau',
4358 'TO': 'Tonga',
4359 'TT': 'Trinidad and Tobago',
4360 'TN': 'Tunisia',
4361 'TR': 'Turkey',
4362 'TM': 'Turkmenistan',
4363 'TC': 'Turks and Caicos Islands',
4364 'TV': 'Tuvalu',
4365 'UG': 'Uganda',
4366 'UA': 'Ukraine',
4367 'AE': 'United Arab Emirates',
4368 'GB': 'United Kingdom',
4369 'US': 'United States',
4370 'UM': 'United States Minor Outlying Islands',
4371 'UY': 'Uruguay',
4372 'UZ': 'Uzbekistan',
4373 'VU': 'Vanuatu',
4374 'VE': 'Venezuela, Bolivarian Republic of',
4375 'VN': 'Viet Nam',
4376 'VG': 'Virgin Islands, British',
4377 'VI': 'Virgin Islands, U.S.',
4378 'WF': 'Wallis and Futuna',
4379 'EH': 'Western Sahara',
4380 'YE': 'Yemen',
4381 'ZM': 'Zambia',
4382 'ZW': 'Zimbabwe',
4383 # Not ISO 3166 codes, but used for IP blocks
4384 'AP': 'Asia/Pacific Region',
4385 'EU': 'Europe',
4386 }
4387
4388 @classmethod
4389 def short2full(cls, code):
4390 """Convert an ISO 3166-2 country code to the corresponding full name"""
4391 return cls._country_map.get(code.upper())
4392
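# Illustrative lookup (editor's addition):
#   >>> ISO3166Utils.short2full('de')
#   'Germany'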
4393
4394 class GeoUtils:
4395 # Major IPv4 address blocks per country
4396 _country_ip_map = {
4397 'AD': '46.172.224.0/19',
4398 'AE': '94.200.0.0/13',
4399 'AF': '149.54.0.0/17',
4400 'AG': '209.59.64.0/18',
4401 'AI': '204.14.248.0/21',
4402 'AL': '46.99.0.0/16',
4403 'AM': '46.70.0.0/15',
4404 'AO': '105.168.0.0/13',
4405 'AP': '182.50.184.0/21',
4406 'AQ': '23.154.160.0/24',
4407 'AR': '181.0.0.0/12',
4408 'AS': '202.70.112.0/20',
4409 'AT': '77.116.0.0/14',
4410 'AU': '1.128.0.0/11',
4411 'AW': '181.41.0.0/18',
4412 'AX': '185.217.4.0/22',
4413 'AZ': '5.197.0.0/16',
4414 'BA': '31.176.128.0/17',
4415 'BB': '65.48.128.0/17',
4416 'BD': '114.130.0.0/16',
4417 'BE': '57.0.0.0/8',
4418 'BF': '102.178.0.0/15',
4419 'BG': '95.42.0.0/15',
4420 'BH': '37.131.0.0/17',
4421 'BI': '154.117.192.0/18',
4422 'BJ': '137.255.0.0/16',
4423 'BL': '185.212.72.0/23',
4424 'BM': '196.12.64.0/18',
4425 'BN': '156.31.0.0/16',
4426 'BO': '161.56.0.0/16',
4427 'BQ': '161.0.80.0/20',
4428 'BR': '191.128.0.0/12',
4429 'BS': '24.51.64.0/18',
4430 'BT': '119.2.96.0/19',
4431 'BW': '168.167.0.0/16',
4432 'BY': '178.120.0.0/13',
4433 'BZ': '179.42.192.0/18',
4434 'CA': '99.224.0.0/11',
4435 'CD': '41.243.0.0/16',
4436 'CF': '197.242.176.0/21',
4437 'CG': '160.113.0.0/16',
4438 'CH': '85.0.0.0/13',
4439 'CI': '102.136.0.0/14',
4440 'CK': '202.65.32.0/19',
4441 'CL': '152.172.0.0/14',
4442 'CM': '102.244.0.0/14',
4443 'CN': '36.128.0.0/10',
4444 'CO': '181.240.0.0/12',
4445 'CR': '201.192.0.0/12',
4446 'CU': '152.206.0.0/15',
4447 'CV': '165.90.96.0/19',
4448 'CW': '190.88.128.0/17',
4449 'CY': '31.153.0.0/16',
4450 'CZ': '88.100.0.0/14',
4451 'DE': '53.0.0.0/8',
4452 'DJ': '197.241.0.0/17',
4453 'DK': '87.48.0.0/12',
4454 'DM': '192.243.48.0/20',
4455 'DO': '152.166.0.0/15',
4456 'DZ': '41.96.0.0/12',
4457 'EC': '186.68.0.0/15',
4458 'EE': '90.190.0.0/15',
4459 'EG': '156.160.0.0/11',
4460 'ER': '196.200.96.0/20',
4461 'ES': '88.0.0.0/11',
4462 'ET': '196.188.0.0/14',
4463 'EU': '2.16.0.0/13',
4464 'FI': '91.152.0.0/13',
4465 'FJ': '144.120.0.0/16',
4466 'FK': '80.73.208.0/21',
4467 'FM': '119.252.112.0/20',
4468 'FO': '88.85.32.0/19',
4469 'FR': '90.0.0.0/9',
4470 'GA': '41.158.0.0/15',
4471 'GB': '25.0.0.0/8',
4472 'GD': '74.122.88.0/21',
4473 'GE': '31.146.0.0/16',
4474 'GF': '161.22.64.0/18',
4475 'GG': '62.68.160.0/19',
4476 'GH': '154.160.0.0/12',
4477 'GI': '95.164.0.0/16',
4478 'GL': '88.83.0.0/19',
4479 'GM': '160.182.0.0/15',
4480 'GN': '197.149.192.0/18',
4481 'GP': '104.250.0.0/19',
4482 'GQ': '105.235.224.0/20',
4483 'GR': '94.64.0.0/13',
4484 'GT': '168.234.0.0/16',
4485 'GU': '168.123.0.0/16',
4486 'GW': '197.214.80.0/20',
4487 'GY': '181.41.64.0/18',
4488 'HK': '113.252.0.0/14',
4489 'HN': '181.210.0.0/16',
4490 'HR': '93.136.0.0/13',
4491 'HT': '148.102.128.0/17',
4492 'HU': '84.0.0.0/14',
4493 'ID': '39.192.0.0/10',
4494 'IE': '87.32.0.0/12',
4495 'IL': '79.176.0.0/13',
4496 'IM': '5.62.80.0/20',
4497 'IN': '117.192.0.0/10',
4498 'IO': '203.83.48.0/21',
4499 'IQ': '37.236.0.0/14',
4500 'IR': '2.176.0.0/12',
4501 'IS': '82.221.0.0/16',
4502 'IT': '79.0.0.0/10',
4503 'JE': '87.244.64.0/18',
4504 'JM': '72.27.0.0/17',
4505 'JO': '176.29.0.0/16',
4506 'JP': '133.0.0.0/8',
4507 'KE': '105.48.0.0/12',
4508 'KG': '158.181.128.0/17',
4509 'KH': '36.37.128.0/17',
4510 'KI': '103.25.140.0/22',
4511 'KM': '197.255.224.0/20',
4512 'KN': '198.167.192.0/19',
4513 'KP': '175.45.176.0/22',
4514 'KR': '175.192.0.0/10',
4515 'KW': '37.36.0.0/14',
4516 'KY': '64.96.0.0/15',
4517 'KZ': '2.72.0.0/13',
4518 'LA': '115.84.64.0/18',
4519 'LB': '178.135.0.0/16',
4520 'LC': '24.92.144.0/20',
4521 'LI': '82.117.0.0/19',
4522 'LK': '112.134.0.0/15',
4523 'LR': '102.183.0.0/16',
4524 'LS': '129.232.0.0/17',
4525 'LT': '78.56.0.0/13',
4526 'LU': '188.42.0.0/16',
4527 'LV': '46.109.0.0/16',
4528 'LY': '41.252.0.0/14',
4529 'MA': '105.128.0.0/11',
4530 'MC': '88.209.64.0/18',
4531 'MD': '37.246.0.0/16',
4532 'ME': '178.175.0.0/17',
4533 'MF': '74.112.232.0/21',
4534 'MG': '154.126.0.0/17',
4535 'MH': '117.103.88.0/21',
4536 'MK': '77.28.0.0/15',
4537 'ML': '154.118.128.0/18',
4538 'MM': '37.111.0.0/17',
4539 'MN': '49.0.128.0/17',
4540 'MO': '60.246.0.0/16',
4541 'MP': '202.88.64.0/20',
4542 'MQ': '109.203.224.0/19',
4543 'MR': '41.188.64.0/18',
4544 'MS': '208.90.112.0/22',
4545 'MT': '46.11.0.0/16',
4546 'MU': '105.16.0.0/12',
4547 'MV': '27.114.128.0/18',
4548 'MW': '102.70.0.0/15',
4549 'MX': '187.192.0.0/11',
4550 'MY': '175.136.0.0/13',
4551 'MZ': '197.218.0.0/15',
4552 'NA': '41.182.0.0/16',
4553 'NC': '101.101.0.0/18',
4554 'NE': '197.214.0.0/18',
4555 'NF': '203.17.240.0/22',
4556 'NG': '105.112.0.0/12',
4557 'NI': '186.76.0.0/15',
4558 'NL': '145.96.0.0/11',
4559 'NO': '84.208.0.0/13',
4560 'NP': '36.252.0.0/15',
4561 'NR': '203.98.224.0/19',
4562 'NU': '49.156.48.0/22',
4563 'NZ': '49.224.0.0/14',
4564 'OM': '5.36.0.0/15',
4565 'PA': '186.72.0.0/15',
4566 'PE': '186.160.0.0/14',
4567 'PF': '123.50.64.0/18',
4568 'PG': '124.240.192.0/19',
4569 'PH': '49.144.0.0/13',
4570 'PK': '39.32.0.0/11',
4571 'PL': '83.0.0.0/11',
4572 'PM': '70.36.0.0/20',
4573 'PR': '66.50.0.0/16',
4574 'PS': '188.161.0.0/16',
4575 'PT': '85.240.0.0/13',
4576 'PW': '202.124.224.0/20',
4577 'PY': '181.120.0.0/14',
4578 'QA': '37.210.0.0/15',
4579 'RE': '102.35.0.0/16',
4580 'RO': '79.112.0.0/13',
4581 'RS': '93.86.0.0/15',
4582 'RU': '5.136.0.0/13',
4583 'RW': '41.186.0.0/16',
4584 'SA': '188.48.0.0/13',
4585 'SB': '202.1.160.0/19',
4586 'SC': '154.192.0.0/11',
4587 'SD': '102.120.0.0/13',
4588 'SE': '78.64.0.0/12',
4589 'SG': '8.128.0.0/10',
4590 'SI': '188.196.0.0/14',
4591 'SK': '78.98.0.0/15',
4592 'SL': '102.143.0.0/17',
4593 'SM': '89.186.32.0/19',
4594 'SN': '41.82.0.0/15',
4595 'SO': '154.115.192.0/18',
4596 'SR': '186.179.128.0/17',
4597 'SS': '105.235.208.0/21',
4598 'ST': '197.159.160.0/19',
4599 'SV': '168.243.0.0/16',
4600 'SX': '190.102.0.0/20',
4601 'SY': '5.0.0.0/16',
4602 'SZ': '41.84.224.0/19',
4603 'TC': '65.255.48.0/20',
4604 'TD': '154.68.128.0/19',
4605 'TG': '196.168.0.0/14',
4606 'TH': '171.96.0.0/13',
4607 'TJ': '85.9.128.0/18',
4608 'TK': '27.96.24.0/21',
4609 'TL': '180.189.160.0/20',
4610 'TM': '95.85.96.0/19',
4611 'TN': '197.0.0.0/11',
4612 'TO': '175.176.144.0/21',
4613 'TR': '78.160.0.0/11',
4614 'TT': '186.44.0.0/15',
4615 'TV': '202.2.96.0/19',
4616 'TW': '120.96.0.0/11',
4617 'TZ': '156.156.0.0/14',
4618 'UA': '37.52.0.0/14',
4619 'UG': '102.80.0.0/13',
4620 'US': '6.0.0.0/8',
4621 'UY': '167.56.0.0/13',
4622 'UZ': '84.54.64.0/18',
4623 'VA': '212.77.0.0/19',
4624 'VC': '207.191.240.0/21',
4625 'VE': '186.88.0.0/13',
4626 'VG': '66.81.192.0/20',
4627 'VI': '146.226.0.0/16',
4628 'VN': '14.160.0.0/11',
4629 'VU': '202.80.32.0/20',
4630 'WF': '117.20.32.0/21',
4631 'WS': '202.4.32.0/19',
4632 'YE': '134.35.0.0/16',
4633 'YT': '41.242.116.0/22',
4634 'ZA': '41.0.0.0/11',
4635 'ZM': '102.144.0.0/13',
4636 'ZW': '102.177.192.0/18',
4637 }
4638
4639 @classmethod
4640 def random_ipv4(cls, code_or_block):
4641 if len(code_or_block) == 2:
4642 block = cls._country_ip_map.get(code_or_block.upper())
4643 if not block:
4644 return None
4645 else:
4646 block = code_or_block
4647 addr, preflen = block.split('/')
4648 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4649 addr_max = addr_min | (0xffffffff >> int(preflen))
4650 return socket.inet_ntoa(
4651 struct.pack('!L', random.randint(addr_min, addr_max)))
4652
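# Editor's sketch; outputs are random, the values shown are hypothetical:
#   >>> GeoUtils.random_ipv4('US')  # doctest: +SKIP
#   '6.123.45.67'
#   >>> GeoUtils.random_ipv4('23.154.160.0/24')  # doctest: +SKIP
#   '23.154.160.211'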
4653
4654 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4655 def __init__(self, proxies=None):
4656 # Set default handlers
4657 for type in ('http', 'https'):
4658 setattr(self, '%s_open' % type,
4659 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4660 meth(r, proxy, type))
4661 urllib.request.ProxyHandler.__init__(self, proxies)
4662
4663 def proxy_open(self, req, proxy, type):
4664 req_proxy = req.headers.get('Ytdl-request-proxy')
4665 if req_proxy is not None:
4666 proxy = req_proxy
4667 del req.headers['Ytdl-request-proxy']
4668
4669 if proxy == '__noproxy__':
4670 return None # No Proxy
4671 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4672 req.add_header('Ytdl-socks-proxy', proxy)
4673 # yt-dlp's http/https handlers handle wrapping the socket with SOCKS
4674 return None
4675 return urllib.request.ProxyHandler.proxy_open(
4676 self, req, proxy, type)
4677
4678
4679 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4680 # released into Public Domain
4681 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4682
4683 def long_to_bytes(n, blocksize=0):
4684 """long_to_bytes(n:long, blocksize:int) : string
4685 Convert a long integer to a byte string.
4686
4687 If optional blocksize is given and greater than zero, pad the front of the
4688 byte string with binary zeros so that the length is a multiple of
4689 blocksize.
4690 """
4691 # after much testing, this algorithm was deemed to be the fastest
4692 s = b''
4693 n = int(n)
4694 while n > 0:
4695 s = struct.pack('>I', n & 0xffffffff) + s
4696 n = n >> 32
4697 # strip off leading zeros
4698 for i in range(len(s)):
4699 if s[i] != b'\000'[0]:
4700 break
4701 else:
4702 # only happens when n == 0
4703 s = b'\000'
4704 i = 0
4705 s = s[i:]
4706 # add back some pad bytes. this could be done more efficiently w.r.t. the
4707 # de-padding being done above, but sigh...
4708 if blocksize > 0 and len(s) % blocksize:
4709 s = (blocksize - len(s) % blocksize) * b'\000' + s
4710 return s
4711
4712
4713 def bytes_to_long(s):
4714 """bytes_to_long(string) : long
4715 Convert a byte string to a long integer.
4716
4717 This is (essentially) the inverse of long_to_bytes().
4718 """
4719 acc = 0
4720 length = len(s)
4721 if length % 4:
4722 extra = (4 - length % 4)
4723 s = b'\000' * extra + s
4724 length = length + extra
4725 for i in range(0, length, 4):
4726 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4727 return acc
4728
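# Editor's round-trip example:
#   >>> long_to_bytes(65537, blocksize=4)
#   b'\x00\x01\x00\x01'
#   >>> bytes_to_long(b'\x00\x01\x00\x01')
#   65537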
4729
4730 def ohdave_rsa_encrypt(data, exponent, modulus):
4731 '''
4732 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4733
4734 Input:
4735 data: data to encrypt, bytes-like object
4736 exponent, modulus: parameter e and N of RSA algorithm, both integer
4737 Output: hex string of encrypted data
4738
4739 Limitation: supports one block encryption only
4740 '''
4741
4742 payload = int(binascii.hexlify(data[::-1]), 16)
4743 encrypted = pow(payload, exponent, modulus)
4744 return '%x' % encrypted
4745
4746
4747 def pkcs1pad(data, length):
4748 """
4749 Pad input data with the PKCS#1 v1.5 scheme
4750
4751 @param {int[]} data input data
4752 @param {int} length target length
4753 @returns {int[]} padded data
4754 """
4755 if len(data) > length - 11:
4756 raise ValueError('Input data too long for PKCS#1 padding')
4757
4758 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding bytes must be nonzero per PKCS#1 v1.5
4759 return [0, 2] + pseudo_random + [0] + data
4760
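# Editor's example; the middle padding bytes are random:
#   >>> padded = pkcs1pad([1, 2, 3], 16)
#   >>> padded[:2], padded[-4], padded[-3:], len(padded)
#   ([0, 2], 0, [1, 2, 3], 16)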
4761
4762 def _base_n_table(n, table):
4763 if not table and not n:
4764 raise ValueError('Either table or n must be specified')
4765 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4766
4767 if n and n != len(table):
4768 raise ValueError(f'base {n} exceeds table length {len(table)}')
4769 return table
4770
4771
4772 def encode_base_n(num, n=None, table=None):
4773 """Convert given int to a base-n string"""
4774 table = _base_n_table(n, table)
4775 if not num:
4776 return table[0]
4777
4778 result, base = '', len(table)
4779 while num:
4780 result = table[num % base] + result
4781 num = num // base
4782 return result
4783
4784
4785 def decode_base_n(string, n=None, table=None):
4786 """Convert given base-n string to int"""
4787 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4788 result, base = 0, len(table)
4789 for char in string:
4790 result = result * base + table[char]
4791 return result
4792
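# Illustrative round trip (editor's addition):
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255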
4793
4794 def decode_base(value, digits):
4795 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4796 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
4797 return decode_base_n(value, table=digits)
4798
4799
4800 def decode_packed_codes(code):
4801 mobj = re.search(PACKED_CODES_RE, code)
4802 obfuscated_code, base, count, symbols = mobj.groups()
4803 base = int(base)
4804 count = int(count)
4805 symbols = symbols.split('|')
4806 symbol_table = {}
4807
4808 while count:
4809 count -= 1
4810 base_n_count = encode_base_n(count, base)
4811 symbol_table[base_n_count] = symbols[count] or base_n_count
4812
4813 return re.sub(
4814 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4815 obfuscated_code)
4816
4817
4818 def caesar(s, alphabet, shift):
4819 if shift == 0:
4820 return s
4821 l = len(alphabet)
4822 return ''.join(
4823 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4824 for c in s)
4825
4826
4827 def rot47(s):
4828 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4829
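# Editor's examples:
#   >>> caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 1)
#   'bcd'
#   >>> rot47('Hello')
#   'w6==@'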
4830
4831 def parse_m3u8_attributes(attrib):
4832 info = {}
4833 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4834 if val.startswith('"'):
4835 val = val[1:-1]
4836 info[key] = val
4837 return info
4838
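# Illustrative parse (editor's addition; attribute string is made up):
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}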
4839
4840 def urshift(val, n):
4841 return val >> n if val >= 0 else (val + 0x100000000) >> n
4842
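# Editor's examples of the unsigned 32-bit right shift:
#   >>> urshift(16, 2)
#   4
#   >>> urshift(-1, 28)
#   15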
4843
4844 # Based on png2str() written by @gdkchan and improved by @yokrysty
4845 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4846 def decode_png(png_data):
4847 # Reference: https://www.w3.org/TR/PNG/
4848 header = png_data[8:]
4849
4850 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4851 raise OSError('Not a valid PNG file.')
4852
4853 int_map = {1: '>B', 2: '>H', 4: '>I'}
4854 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4855
4856 chunks = []
4857
4858 while header:
4859 length = unpack_integer(header[:4])
4860 header = header[4:]
4861
4862 chunk_type = header[:4]
4863 header = header[4:]
4864
4865 chunk_data = header[:length]
4866 header = header[length:]
4867
4868 header = header[4:] # Skip CRC
4869
4870 chunks.append({
4871 'type': chunk_type,
4872 'length': length,
4873 'data': chunk_data
4874 })
4875
4876 ihdr = chunks[0]['data']
4877
4878 width = unpack_integer(ihdr[:4])
4879 height = unpack_integer(ihdr[4:8])
4880
4881 idat = b''
4882
4883 for chunk in chunks:
4884 if chunk['type'] == b'IDAT':
4885 idat += chunk['data']
4886
4887 if not idat:
4888 raise OSError('Unable to read PNG data.')
4889
4890 decompressed_data = bytearray(zlib.decompress(idat))
4891
4892 stride = width * 3
4893 pixels = []
4894
4895 def _get_pixel(idx):
4896 x = idx % stride
4897 y = idx // stride
4898 return pixels[y][x]
4899
4900 for y in range(height):
4901 basePos = y * (1 + stride)
4902 filter_type = decompressed_data[basePos]
4903
4904 current_row = []
4905
4906 pixels.append(current_row)
4907
4908 for x in range(stride):
4909 color = decompressed_data[1 + basePos + x]
4910 basex = y * stride + x
4911 left = 0
4912 up = 0
4913
4914 if x > 2:
4915 left = _get_pixel(basex - 3)
4916 if y > 0:
4917 up = _get_pixel(basex - stride)
4918
4919 if filter_type == 1: # Sub
4920 color = (color + left) & 0xff
4921 elif filter_type == 2: # Up
4922 color = (color + up) & 0xff
4923 elif filter_type == 3: # Average
4924 color = (color + ((left + up) >> 1)) & 0xff
4925 elif filter_type == 4: # Paeth
4926 a = left
4927 b = up
4928 c = 0
4929
4930 if x > 2 and y > 0:
4931 c = _get_pixel(basex - stride - 3)
4932
4933 p = a + b - c
4934
4935 pa = abs(p - a)
4936 pb = abs(p - b)
4937 pc = abs(p - c)
4938
4939 if pa <= pb and pa <= pc:
4940 color = (color + a) & 0xff
4941 elif pb <= pc:
4942 color = (color + b) & 0xff
4943 else:
4944 color = (color + c) & 0xff
4945
4946 current_row.append(color)
4947
4948 return width, height, pixels
4949
4950
4951 def write_xattr(path, key, value):
4952 # Windows: Write xattrs to NTFS Alternate Data Streams:
4953 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4954 if compat_os_name == 'nt':
4955 assert ':' not in key
4956 assert os.path.exists(path)
4957
4958 try:
4959 with open(f'{path}:{key}', 'wb') as f:
4960 f.write(value)
4961 except OSError as e:
4962 raise XAttrMetadataError(e.errno, e.strerror)
4963 return
4964
4965 # UNIX Method 1. Use xattrs/pyxattrs modules
4966
4967 setxattr = None
4968 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4969 # Unicode arguments are not supported in pyxattr until version 0.5.0
4970 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4971 if version_tuple(xattr.__version__) >= (0, 5, 0):
4972 setxattr = xattr.set
4973 elif xattr:
4974 setxattr = xattr.setxattr
4975
4976 if setxattr:
4977 try:
4978 setxattr(path, key, value)
4979 except OSError as e:
4980 raise XAttrMetadataError(e.errno, e.strerror)
4981 return
4982
4983 # UNIX Method 2. Use setfattr/xattr executables
4984 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4985 else 'xattr' if check_executable('xattr', ['-h']) else None)
4986 if not exe:
4987 raise XAttrUnavailableError(
4988 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4989 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4990
4991 value = value.decode()
4992 try:
4993 _, stderr, returncode = Popen.run(
4994 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4995 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4996 except OSError as e:
4997 raise XAttrMetadataError(e.errno, e.strerror)
4998 if returncode:
4999 raise XAttrMetadataError(returncode, stderr)
5000
5001
5002 def random_birthday(year_field, month_field, day_field):
5003 start_date = datetime.date(1950, 1, 1)
5004 end_date = datetime.date(1995, 12, 31)
5005 offset = random.randint(0, (end_date - start_date).days)
5006 random_date = start_date + datetime.timedelta(offset)
5007 return {
5008 year_field: str(random_date.year),
5009 month_field: str(random_date.month),
5010 day_field: str(random_date.day),
5011 }
5012
5013
5014 # Templates for internet shortcut files, which are plain text files.
5015 DOT_URL_LINK_TEMPLATE = '''\
5016 [InternetShortcut]
5017 URL=%(url)s
5018 '''
5019
5020 DOT_WEBLOC_LINK_TEMPLATE = '''\
5021 <?xml version="1.0" encoding="UTF-8"?>
5022 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5023 <plist version="1.0">
5024 <dict>
5025 \t<key>URL</key>
5026 \t<string>%(url)s</string>
5027 </dict>
5028 </plist>
5029 '''
5030
5031 DOT_DESKTOP_LINK_TEMPLATE = '''\
5032 [Desktop Entry]
5033 Encoding=UTF-8
5034 Name=%(filename)s
5035 Type=Link
5036 URL=%(url)s
5037 Icon=text-html
5038 '''
5039
5040 LINK_TEMPLATES = {
5041 'url': DOT_URL_LINK_TEMPLATE,
5042 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5043 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5044 }
5045
5046
5047 def iri_to_uri(iri):
5048 """
5049 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5050
5051 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes the characters that need it, using an underlying UTF-8 encoding, and leaves already-escaped sequences intact.
5052 """
5053
5054 iri_parts = urllib.parse.urlparse(iri)
5055
5056 if '[' in iri_parts.netloc:
5057 raise ValueError('IPv6 URIs are not yet supported.')
5058 # Querying `.netloc` also raises a ValueError when there is only one bracket.
5059
5060 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5061
5062 net_location = ''
5063 if iri_parts.username:
5064 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5065 if iri_parts.password is not None:
5066 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5067 net_location += '@'
5068
5069 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5070 # The 'idna' encoding produces ASCII text.
5071 if iri_parts.port is not None and iri_parts.port != 80:
5072 net_location += ':' + str(iri_parts.port)
5073
5074 return urllib.parse.urlunparse(
5075 (iri_parts.scheme,
5076 net_location,
5077
5078 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5079
5080 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5081 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5082
5083 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5084 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5085
5086 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5087
5088 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5089
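# Editor's example (hypothetical URL):
#   >>> iri_to_uri('http://example.com/päth?q=ü')
#   'http://example.com/p%C3%A4th?q=%C3%BC'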
5090
5091 def to_high_limit_path(path):
5092 if sys.platform in ['win32', 'cygwin']:
5093 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5094 return '\\\\?\\' + os.path.abspath(path)
5095
5096 return path
5097
5098
5099 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5100 val = traverse_obj(obj, *variadic(field))
5101 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5102 return default
5103 return template % func(val)
5104
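# Illustrative usage (editor's addition):
#   >>> format_field({'width': 1280}, 'width', '%dpx')
#   '1280px'
#   >>> format_field({}, 'width', '%dpx', default='unknown')
#   'unknown'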
5105
5106 def clean_podcast_url(url):
5107 return re.sub(r'''(?x)
5108 (?:
5109 (?:
5110 chtbl\.com/track|
5111 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5112 play\.podtrac\.com
5113 )/[^/]+|
5114 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5115 flex\.acast\.com|
5116 pd(?:
5117 cn\.co| # https://podcorn.com/analytics-prefix/
5118 st\.fm # https://podsights.com/docs/
5119 )/e
5120 )/''', '', url)
5121
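# Editor's example with a made-up tracking prefix:
#   >>> clean_podcast_url('https://chtbl.com/track/XXXXX/traffic.example.com/episode.mp3')
#   'https://traffic.example.com/episode.mp3'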
5122
5123 _HEX_TABLE = '0123456789abcdef'
5124
5125
5126 def random_uuidv4():
5127 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5128
5129
5130 def make_dir(path, to_screen=None):
5131 try:
5132 dn = os.path.dirname(path)
5133 if dn and not os.path.exists(dn):
5134 os.makedirs(dn)
5135 return True
5136 except OSError as err:
5137 if callable(to_screen):  # `callable()` returns a bool; comparing it to None was always true
5138 to_screen('unable to create directory ' + error_to_compat_str(err))
5139 return False
5140
5141
5142 def get_executable_path():
5143 from .update import _get_variant_and_executable_path
5144
5145 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5146
5147
5148 def load_plugins(name, suffix, namespace):
5149 classes = {}
5150 with contextlib.suppress(FileNotFoundError):
5151 plugins_spec = importlib.util.spec_from_file_location(
5152 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5153 plugins = importlib.util.module_from_spec(plugins_spec)
5154 sys.modules[plugins_spec.name] = plugins
5155 plugins_spec.loader.exec_module(plugins)
5156 for name in dir(plugins):
5157 if name in namespace:
5158 continue
5159 if not name.endswith(suffix):
5160 continue
5161 klass = getattr(plugins, name)
5162 classes[name] = namespace[name] = klass
5163 return classes
5164
5165
5166 def traverse_obj(
5167 obj, *path_list, default=None, expected_type=None, get_all=True,
5168 casesense=True, is_user_input=False, traverse_string=False):
5169 ''' Traverse nested list/dict/tuple
5170 @param path_list A list of paths which are checked one by one.
5171 Each path is a list of keys where each key is a:
5172 - None: Do nothing
5173 - string: A dictionary key
5174 - int: An index into a list
5175 - tuple: A list of keys all of which will be traversed
5176 - Ellipsis: Fetch all values in the object
5177 - Function: Takes the key and value as arguments
5178 and returns whether the key matches or not
5179 @param default Default value to return
5180 @param expected_type Only accept final value of this type (Can also be any callable)
5181 @param get_all Return all the values obtained from a path or only the first one
5182 @param casesense Whether to consider dictionary keys as case sensitive
5183 @param is_user_input Whether the keys are generated from user input. If True,
5184 strings are converted to int/slice if necessary
5185 @param traverse_string Whether to traverse inside strings. If True, any
5186 non-compatible object will also be converted into a string
5187 # TODO: Write tests
5188 '''
5189 if not casesense:
5190 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5191 path_list = (map(_lower, variadic(path)) for path in path_list)
5192
5193 def _traverse_obj(obj, path, _current_depth=0):
5194 nonlocal depth
5195 path = tuple(variadic(path))
5196 for i, key in enumerate(path):
5197 if None in (key, obj):
5198 return obj
5199 if isinstance(key, (list, tuple)):
5200 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5201 key = ...
5202 if key is ...:
5203 obj = (obj.values() if isinstance(obj, dict)
5204 else obj if isinstance(obj, (list, tuple, LazyList))
5205 else str(obj) if traverse_string else [])
5206 _current_depth += 1
5207 depth = max(depth, _current_depth)
5208 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5209 elif callable(key):
5210 if isinstance(obj, (list, tuple, LazyList)):
5211 obj = enumerate(obj)
5212 elif isinstance(obj, dict):
5213 obj = obj.items()
5214 else:
5215 if not traverse_string:
5216 return None
5217 obj = str(obj)
5218 _current_depth += 1
5219 depth = max(depth, _current_depth)
5220 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5221 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5222 obj = (obj.get(key) if casesense or (key in obj)
5223 else next((v for k, v in obj.items() if _lower(k) == key), None))
5224 else:
5225 if is_user_input:
5226 key = (int_or_none(key) if ':' not in key
5227 else slice(*map(int_or_none, key.split(':'))))
5228 if key == slice(None):
5229 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5230 if not isinstance(key, (int, slice)):
5231 return None
5232 if not isinstance(obj, (list, tuple, LazyList)):
5233 if not traverse_string:
5234 return None
5235 obj = str(obj)
5236 try:
5237 obj = obj[key]
5238 except IndexError:
5239 return None
5240 return obj
5241
5242 if isinstance(expected_type, type):
5243 type_test = lambda val: val if isinstance(val, expected_type) else None
5244 else:
5245 type_test = expected_type or IDENTITY
5246
5247 for path in path_list:
5248 depth = 0
5249 val = _traverse_obj(obj, path)
5250 if val is not None:
5251 if depth:
5252 for _ in range(depth - 1):
5253 val = itertools.chain.from_iterable(v for v in val if v is not None)
5254 val = [v for v in map(type_test, val) if v is not None]
5255 if val:
5256 return val if get_all else val[0]
5257 else:
5258 val = type_test(val)
5259 if val is not None:
5260 return val
5261 return default
5262
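# Editor's examples of common traversal patterns:
#   >>> traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', 0, 'b'))
#   1
#   >>> traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))
#   [1, 2]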
5263
5264 def traverse_dict(dictn, keys, casesense=True):
5265 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5266 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5267 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5268
5269
5270 def get_first(obj, keys, **kwargs):
5271 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5272
5273
5274 def variadic(x, allowed_types=(str, bytes, dict)):
5275 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5276
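# Editor's examples; str/bytes/dict count as single values:
#   >>> variadic('spam')
#   ('spam',)
#   >>> variadic(['spam', 'eggs'])
#   ['spam', 'eggs']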
5277
5278 def time_seconds(**kwargs):
5279 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5280 return t.timestamp()
5281
5282
5283 # create a JSON Web Signature (jws) with HS256 algorithm
5284 # the resulting format is in JWS Compact Serialization
5285 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5286 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5287 def jwt_encode_hs256(payload_data, key, headers={}):
5288 header_data = {
5289 'alg': 'HS256',
5290 'typ': 'JWT',
5291 }
5292 if headers:
5293 header_data.update(headers)
5294 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5295 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5296 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5297 signature_b64 = base64.b64encode(h.digest())
5298 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5299 return token
5300
5301
5302 # Can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5303 def jwt_decode_hs256(jwt):
5304 header_b64, payload_b64, signature_b64 = jwt.split('.')
5305 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5306 return payload_data
5307
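# Editor's round-trip sketch; note jwt_encode_hs256 emits padded standard base64,
# which urlsafe_b64decode in jwt_decode_hs256 still accepts, while real-world JWTs
# use unpadded base64url:
#   >>> token = jwt_encode_hs256({'uid': 1}, 'secret')
#   >>> jwt_decode_hs256(token.decode())
#   {'uid': 1}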
5308
5309 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5310
5311
5312 @functools.cache
5313 def supports_terminal_sequences(stream):
5314 if compat_os_name == 'nt':
5315 if not WINDOWS_VT_MODE:
5316 return False
5317 elif not os.getenv('TERM'):
5318 return False
5319 try:
5320 return stream.isatty()
5321 except BaseException:
5322 return False
5323
5324
5325 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5326 if get_windows_version() < (10, 0, 10586):
5327 return
5328 global WINDOWS_VT_MODE
5329 try:
5330 Popen.run('', shell=True)
5331 except Exception:
5332 return
5333
5334 WINDOWS_VT_MODE = True
5335 supports_terminal_sequences.cache_clear()
5336
5337
5338 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5339
5340
5341 def remove_terminal_sequences(string):
5342 return _terminal_sequences_re.sub('', string)
5343
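# Editor's example:
#   >>> remove_terminal_sequences('\033[0;32mOK\033[0m')
#   'OK'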
5344
5345 def number_of_digits(number):
5346 return len('%d' % number)
5347
5348
5349 def join_nonempty(*values, delim='-', from_dict=None):
5350 if from_dict is not None:
5351 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5352 return delim.join(map(str, filter(None, values)))
5353
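# Editor's examples:
#   >>> join_nonempty('a', None, '', 'b', delim='/')
#   'a/b'
#   >>> join_nonempty('title', 'id', from_dict={'title': 'Video', 'id': '42'})
#   'Video-42'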
5354
5355 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5356 """
5357 Find the largest format dimensions in terms of video width and, for each thumbnail:
5358 * Modify the URL: match the width with the provided regex and replace it with the largest format width
5359 * Update dimensions
5360
5361 This function is useful with video services that scale the provided thumbnails on demand
5362 """
5363 _keys = ('width', 'height')
5364 max_dimensions = max(
5365 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5366 default=(0, 0))
5367 if not max_dimensions[0]:
5368 return thumbnails
5369 return [
5370 merge_dicts(
5371 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5372 dict(zip(_keys, max_dimensions)), thumbnail)
5373 for thumbnail in thumbnails
5374 ]
5375
5376
5377 def parse_http_range(range):
5378 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5379 if not range:
5380 return None, None, None
5381 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5382 if not crg:
5383 return None, None, None
5384 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5385
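# Illustrative parses (editor's addition):
#   >>> parse_http_range('bytes 0-499/1234')
#   (0, 499, 1234)
#   >>> parse_http_range('bytes=500-')
#   (500, None, None)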
5386
5387 def read_stdin(what):
5388 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5389 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5390 return sys.stdin
5391
5392
5393 def determine_file_encoding(data):
5394 """
5395 Detect the text encoding used
5396 @returns (encoding, bytes to skip)
5397 """
5398
5399 # BOM marks are given priority over declarations
5400 for bom, enc in BOMS:
5401 if data.startswith(bom):
5402 return enc, len(bom)
5403
5404 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5405 # We ignore the endianness to get a good enough match
5406 data = data.replace(b'\0', b'')
5407 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5408 return mobj.group(1).decode() if mobj else None, 0
5409
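# Editor's examples:
#   >>> determine_file_encoding(b'\xef\xbb\xbf# options')
#   ('utf-8', 3)
#   >>> determine_file_encoding(b'# coding: utf-8\n--verbose')
#   ('utf-8', 0)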
5410
5411 class Config:
5412 own_args = None
5413 parsed_args = None
5414 filename = None
5415 __initialized = False
5416
5417 def __init__(self, parser, label=None):
5418 self.parser, self.label = parser, label
5419 self._loaded_paths, self.configs = set(), []
5420
5421 def init(self, args=None, filename=None):
5422 assert not self.__initialized
5423 self.own_args, self.filename = args, filename
5424 return self.load_configs()
5425
5426 def load_configs(self):
5427 directory = ''
5428 if self.filename:
5429 location = os.path.realpath(self.filename)
5430 directory = os.path.dirname(location)
5431 if location in self._loaded_paths:
5432 return False
5433 self._loaded_paths.add(location)
5434
5435 self.__initialized = True
5436 opts, _ = self.parser.parse_known_args(self.own_args)
5437 self.parsed_args = self.own_args
5438 for location in opts.config_locations or []:
5439 if location == '-':
5440 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5441 continue
5442 location = os.path.join(directory, expand_path(location))
5443 if os.path.isdir(location):
5444 location = os.path.join(location, 'yt-dlp.conf')
5445 if not os.path.exists(location):
5446 self.parser.error(f'config location {location} does not exist')
5447 self.append_config(self.read_file(location), location)
5448 return True
5449
5450 def __str__(self):
5451 label = join_nonempty(
5452 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5453 delim=' ')
5454 return join_nonempty(
5455 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5456 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5457 delim='\n')
5458
5459 @staticmethod
5460 def read_file(filename, default=[]):
5461 try:
5462 optionf = open(filename, 'rb')
5463 except OSError:
5464 return default # silently skip if file is not present
5465 try:
5466 enc, skip = determine_file_encoding(optionf.read(512))
5467 optionf.seek(skip, io.SEEK_SET)
5468 except OSError:
5469 enc = None # silently skip read errors
5470 try:
5471 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5472 contents = optionf.read().decode(enc or preferredencoding())
5473 res = shlex.split(contents, comments=True)
5474 except Exception as err:
5475 raise ValueError(f'Unable to parse "{filename}": {err}')
5476 finally:
5477 optionf.close()
5478 return res
5479
5480 @staticmethod
5481 def hide_login_info(opts):
5482 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5483 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5484
5485 def _scrub_eq(o):
5486 m = eqre.match(o)
5487 if m:
5488 return m.group('key') + '=PRIVATE'
5489 else:
5490 return o
5491
5492 opts = list(map(_scrub_eq, opts))
5493 for idx, opt in enumerate(opts):
5494 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5495 opts[idx + 1] = 'PRIVATE'
5496 return opts
5497
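# Editor's note: a minimal sketch of hide_login_info; both the separate-argument
# and the "--opt=value" forms are scrubbed:
#   >>> Config.hide_login_info(['-u', 'name', '--password=secret'])
#   ['-u', 'PRIVATE', '--password=PRIVATE']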
5498 def append_config(self, *args, label=None):
5499 config = type(self)(self.parser, label)
5500 config._loaded_paths = self._loaded_paths
5501 if config.init(*args):
5502 self.configs.append(config)
5503
5504 @property
5505 def all_args(self):
5506 for config in reversed(self.configs):
5507 yield from config.all_args
5508 yield from self.parsed_args or []
5509
5510 def parse_known_args(self, **kwargs):
5511 return self.parser.parse_known_args(self.all_args, **kwargs)
5512
5513 def parse_args(self):
5514 return self.parser.parse_args(self.all_args)
5515
5516
5517 class WebSocketsWrapper:
5518 """Wraps websockets module to use in non-async scopes"""
5519 pool = None
5520
5521 def __init__(self, url, headers=None, connect=True):
5522 self.loop = asyncio.new_event_loop()
5523 # XXX: "loop" is deprecated
5524 self.conn = websockets.connect(
5525 url, extra_headers=headers, ping_interval=None,
5526 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5527 if connect:
5528 self.__enter__()
5529 atexit.register(self.__exit__, None, None, None)
5530
5531 def __enter__(self):
5532 if not self.pool:
5533 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5534 return self
5535
5536 def send(self, *args):
5537 self.run_with_loop(self.pool.send(*args), self.loop)
5538
5539 def recv(self, *args):
5540 return self.run_with_loop(self.pool.recv(*args), self.loop)
5541
5542 def __exit__(self, type, value, traceback):
5543 try:
5544 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5545 finally:
5546 self._cancel_all_tasks(self.loop)  # must run before the loop is closed
5547 self.loop.close()
5548
5549 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5550 # for contributors: if any new library that uses asyncio needs to run in non-async scopes, move these functions out of this class
5551 @staticmethod
5552 def run_with_loop(main, loop):
5553 if not asyncio.iscoroutine(main):
5554 raise ValueError(f'a coroutine was expected, got {main!r}')
5555
5556 try:
5557 return loop.run_until_complete(main)
5558 finally:
5559 loop.run_until_complete(loop.shutdown_asyncgens())
5560 if hasattr(loop, 'shutdown_default_executor'):
5561 loop.run_until_complete(loop.shutdown_default_executor())
5562
5563 @staticmethod
5564 def _cancel_all_tasks(loop):
5565 to_cancel = asyncio.all_tasks(loop)
5566
5567 if not to_cancel:
5568 return
5569
5570 for task in to_cancel:
5571 task.cancel()
5572
5573 # XXX: the "loop" argument was removed from asyncio.gather in Python 3.10+; the tasks are already bound to `loop`
5574 loop.run_until_complete(
5575 asyncio.gather(*to_cancel, return_exceptions=True))
5576
5577 for task in to_cancel:
5578 if task.cancelled():
5579 continue
5580 if task.exception() is not None:
5581 loop.call_exception_handler({
5582 'message': 'unhandled exception during asyncio.run() shutdown',
5583 'exception': task.exception(),
5584 'task': task,
5585 })
5586
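# Editor's note: run_with_loop drives any coroutine to completion on a dedicated
# event loop, so it can be exercised without a live websocket server:
#   >>> loop = asyncio.new_event_loop()
#   >>> WebSocketsWrapper.run_with_loop(asyncio.sleep(0, result=42), loop)
#   42
#   >>> loop.close()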
5587
5588 def merge_headers(*dicts):
5589 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5590 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5591
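# Editor's note: usage sketch; keys are title-cased, so differently-cased
# duplicates collapse into one entry and the last value wins:
#   >>> merge_headers({'accept': '*/*', 'User-agent': 'a'}, {'Accept': 'text/html'})
#   {'Accept': 'text/html', 'User-Agent': 'a'}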
5592
5593 def cached_method(f):
5594 """Cache a method"""
5595 signature = inspect.signature(f)
5596
5597 @functools.wraps(f)
5598 def wrapper(self, *args, **kwargs):
5599 bound_args = signature.bind(self, *args, **kwargs)
5600 bound_args.apply_defaults()
5601 key = tuple(bound_args.arguments.values())
5602
5603 if not hasattr(self, '__cached_method__cache'):
5604 self.__cached_method__cache = {}
5605 cache = self.__cached_method__cache.setdefault(f.__name__, {})
5606 if key not in cache:
5607 cache[key] = f(self, *args, **kwargs)
5608 return cache[key]
5609 return wrapper
5610
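# Editor's note: a minimal sketch of @cached_method; the Example class below is
# hypothetical. The cache key is the bound arguments, so repeat calls with the
# same arguments skip the method body:
#   >>> class Example:
#   ...     @cached_method
#   ...     def double(self, x):
#   ...         print('computing')
#   ...         return x * 2
#   >>> e = Example()
#   >>> e.double(21)
#   computing
#   42
#   >>> e.double(21)
#   42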
5611
5612 class classproperty:
5613 """property access for class methods"""
5614
5615 def __init__(self, func):
5616 functools.update_wrapper(self, func)
5617 self.func = func
5618
5619 def __get__(self, _, cls):
5620 return self.func(cls)
5621
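# Editor's note: usage sketch for classproperty; the Extractor class is hypothetical:
#   >>> class Extractor:
#   ...     @classproperty
#   ...     def name(cls):
#   ...         return cls.__name__.lower()
#   >>> Extractor.name
#   'extractor'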
5622
5623 class Namespace(types.SimpleNamespace):
5624 """Immutable namespace"""
5625
5626 def __iter__(self):
5627 return iter(self.__dict__.values())
5628
5629 @property
5630 def items_(self):
5631 return self.__dict__.items()
5632
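# Editor's note: usage sketch; iteration yields the values, items_ exposes the pairs:
#   >>> ns = Namespace(a=1, b=2)
#   >>> list(ns)
#   [1, 2]
#   >>> dict(ns.items_)
#   {'a': 1, 'b': 2}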
5633
5634 MEDIA_EXTENSIONS = Namespace(
5635 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5636 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5637 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5638 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5639 thumbnails=('jpg', 'png', 'webp'),
5640 storyboards=('mhtml', ),
5641 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5642 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5643 )
5644 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5645 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5646
5647 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5648
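# Editor's note: KNOWN_EXTENSIONS covers video, audio and manifest extensions only:
#   >>> 'mp4' in KNOWN_EXTENSIONS
#   True
#   >>> 'srt' in KNOWN_EXTENSIONS  # subtitle/thumbnail extensions are not included
#   False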
5649
5650 # Deprecated
5651 has_certifi = bool(certifi)
5652 has_websockets = bool(websockets)