import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import asyncio, functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
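
# Illustrative example (added for this edit; not in the upstream module). The
# expanded form is the Clark notation that xml.etree.ElementTree expects:
#   >>> xpath_with_ns('ns:video/ns:title', {'ns': 'http://example.com/ns'})
#   '{http://example.com/ns}video/{http://example.com/ns}title'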


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None
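
# Illustrative example (added; the HTML snippet is made up):
#   >>> get_element_by_class('foo', '<span class="foo bar">text</span>')
#   'text'
#   >>> get_element_html_by_class('foo', '<span class="foo bar">text</span>')
#   '<span class="foo bar">text</span>'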


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
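
# Illustrative example (added): tags are stripped, <br> and </p><p> become newlines.
#   >>> clean_html('<p>Hello<br/>World</p>')
#   'Hello\nWorld'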


class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)
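
# Illustrative example (added): with ignore_extra=True, trailing garbage after
# the first JSON value is discarded instead of raising.
#   >>> LenientJSONDecoder(ignore_extra=True).decode('{"status": "ok"} trailing junk')
#   {'status': 'ok'}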


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
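
# Illustrative example (added):
#   >>> timeconvert('Wed, 14 Jun 2017 07:00:00 GMT')  # RFC 2822 date
#   1497423600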


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
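
# Illustrative examples (added):
#   >>> sanitize_filename('Foo: Bar')
#   'Foo - Bar'
#   >>> sanitize_filename('Foo: Bar', restricted=True)
#   'Foo_-_Bar'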


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
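
# Illustrative examples (added):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com/video')
#   'https://example.com/video'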


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
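
# Illustrative example (added): credentials are moved out of the netloc and
# returned as a ready-to-use Authorization header value.
#   >>> extract_basic_auth('http://user:pass@example.com/path')
#   ('http://example.com/path', 'Basic dXNlcjpwYXNz')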


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
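
# Illustrative example (added): order of first occurrence is preserved; the
# membership test is linear per item since `seen` is a list (to allow unhashables).
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]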


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
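
# Illustrative example (added): named, decimal and hex entities are all handled.
#   >>> unescapeHTML('Bert &amp; Ernie &#38; friends &#x26; co')
#   'Bert & Ernie & friends & co'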


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode
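
# Illustrative usage sketch (added; assumes the interpreter can spawn itself):
#   stdout, stderr, returncode = Popen.run(
#       [sys.executable, '--version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)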


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
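
# Illustrative examples (added):
#   >>> formatSeconds(3661)
#   '1:01:01'
#   >>> formatSeconds(61.5, msec=True)
#   '1:01.500'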


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to work around _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
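
# Illustrative example (added): the marker header suppresses compression and is
# itself removed before the request is sent.
#   >>> handle_youtubedl_headers({'Accept-Encoding': 'gzip', 'Youtubedl-no-compression': '1'})
#   {}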
1259
1260
1261 class YoutubeDLHandler(urllib.request.HTTPHandler):
1262 """Handler for HTTP requests and responses.
1263
1264 This class, when installed with an OpenerDirector, automatically adds
1265 the standard headers to every HTTP request and handles gzipped and
1266 deflated responses from web servers. If compression is to be avoided in
1267 a particular request, the original request in the program code only has
1268 to include the HTTP header "Youtubedl-no-compression", which will be
1269 removed before making the real request.
1270
1271 Part of this code was copied from:
1272
1273 http://techknack.net/python-urllib2-handlers/
1274
1275 Andrew Rowls, the author of that code, agreed to release it to the
1276 public domain.
1277 """
1278
1279 def __init__(self, params, *args, **kwargs):
1280 urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
1281 self._params = params
1282
1283 def http_open(self, req):
1284 conn_class = http.client.HTTPConnection
1285
1286 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1287 if socks_proxy:
1288 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1289 del req.headers['Ytdl-socks-proxy']
1290
1291 return self.do_open(functools.partial(
1292 _create_http_connection, self, conn_class, False),
1293 req)
1294
1295 @staticmethod
1296 def deflate(data):
1297 if not data:
1298 return data
1299 try:
1300 return zlib.decompress(data, -zlib.MAX_WBITS)
1301 except zlib.error:
1302 return zlib.decompress(data)
1303
1304 @staticmethod
1305 def brotli(data):
1306 if not data:
1307 return data
1308 return brotli.decompress(data)
1309
1310 def http_request(self, req):
1311 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1312 # always respected by websites, some tend to give out URLs with non percent-encoded
1313 # non-ASCII characters (see telemb.py, ard.py [#3412])
1314 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1315 # To work around aforementioned issue we will replace request's original URL with
1316 # percent-encoded one
1317 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1318 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1319 url = req.get_full_url()
1320 url_escaped = escape_url(url)
1321
1322 # Substitute URL if any change after escaping
1323 if url != url_escaped:
1324 req = update_Request(req, url=url_escaped)
1325
1326 for h, v in self._params.get('http_headers', std_headers).items():
1327 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1328 # The dict keys are capitalized because of this bug by urllib
1329 if h.capitalize() not in req.headers:
1330 req.add_header(h, v)
1331
1332 if 'Accept-encoding' not in req.headers:
1333 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1334
1335 req.headers = handle_youtubedl_headers(req.headers)
1336
1337 return super().do_request_(req)
1338
1339 def http_response(self, req, resp):
1340 old_resp = resp
1341 # gzip
1342 if resp.headers.get('Content-encoding', '') == 'gzip':
1343 content = resp.read()
1344 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1345 try:
1346 uncompressed = io.BytesIO(gz.read())
1347 except OSError as original_ioerror:
1348 # There may be junk add the end of the file
1349 # See http://stackoverflow.com/q/4928560/35070 for details
1350 for i in range(1, 1024):
1351 try:
1352 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1353 uncompressed = io.BytesIO(gz.read())
1354 except OSError:
1355 continue
1356 break
1357 else:
1358 raise original_ioerror
1359 resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1360 resp.msg = old_resp.msg
1361 del resp.headers['Content-encoding']
1362 # deflate
1363 if resp.headers.get('Content-encoding', '') == 'deflate':
1364 gz = io.BytesIO(self.deflate(resp.read()))
1365 resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1366 resp.msg = old_resp.msg
1367 del resp.headers['Content-encoding']
1368 # brotli
1369 if resp.headers.get('Content-encoding', '') == 'br':
1370 resp = urllib.request.addinfourl(
1371 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1372 resp.msg = old_resp.msg
1373 del resp.headers['Content-encoding']
1374 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1375 # https://github.com/ytdl-org/youtube-dl/issues/6457).
1376 if 300 <= resp.code < 400:
1377 location = resp.headers.get('Location')
1378 if location:
1379 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1380 location = location.encode('iso-8859-1').decode()
1381 location_escaped = escape_url(location)
1382 if location != location_escaped:
1383 del resp.headers['Location']
1384 resp.headers['Location'] = location_escaped
1385 return resp
1386
1387 https_request = http_request
1388 https_response = http_response
1389
1390
1391 def make_socks_conn_class(base_class, socks_proxy):
1392 assert issubclass(base_class, (
1393 http.client.HTTPConnection, http.client.HTTPSConnection))
1394
1395 url_components = urllib.parse.urlparse(socks_proxy)
1396 if url_components.scheme.lower() == 'socks5':
1397 socks_type = ProxyType.SOCKS5
1398 elif url_components.scheme.lower() in ('socks', 'socks4'):
1399 socks_type = ProxyType.SOCKS4
1400 elif url_components.scheme.lower() == 'socks4a':
1401 socks_type = ProxyType.SOCKS4A
1402
1403 def unquote_if_non_empty(s):
1404 if not s:
1405 return s
1406 return urllib.parse.unquote_plus(s)
1407
1408 proxy_args = (
1409 socks_type,
1410 url_components.hostname, url_components.port or 1080,
1411 True, # Remote DNS
1412 unquote_if_non_empty(url_components.username),
1413 unquote_if_non_empty(url_components.password),
1414 )
1415
1416 class SocksConnection(base_class):
1417 def connect(self):
1418 self.sock = sockssocket()
1419 self.sock.setproxy(*proxy_args)
1420 if isinstance(self.timeout, (int, float)):
1421 self.sock.settimeout(self.timeout)
1422 self.sock.connect((self.host, self.port))
1423
1424 if isinstance(self, http.client.HTTPSConnection):
1425 if hasattr(self, '_context'): # Python > 2.6
1426 self.sock = self._context.wrap_socket(
1427 self.sock, server_hostname=self.host)
1428 else:
1429 self.sock = ssl.wrap_socket(self.sock)
1430
1431 return SocksConnection
1432
1433
1434 class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
1435 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1436 urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1437 self._https_conn_class = https_conn_class or http.client.HTTPSConnection
1438 self._params = params
1439
1440 def https_open(self, req):
1441 kwargs = {}
1442 conn_class = self._https_conn_class
1443
1444 if hasattr(self, '_context'): # python > 2.6
1445 kwargs['context'] = self._context
1446 if hasattr(self, '_check_hostname'): # python 3.x
1447 kwargs['check_hostname'] = self._check_hostname
1448
1449 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1450 if socks_proxy:
1451 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1452 del req.headers['Ytdl-socks-proxy']
1453
1454 try:
1455 return self.do_open(
1456 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1457 except urllib.error.URLError as e:
1458 if (isinstance(e.reason, ssl.SSLError)
1459 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1460 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1461 raise
1462
1463
1464 class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
1465 """
1466 See [1] for cookie file format.
1467
1468 1. https://curl.haxx.se/docs/http-cookies.html
1469 """
1470 _HTTPONLY_PREFIX = '#HttpOnly_'
1471 _ENTRY_LEN = 7
1472 _HEADER = '''# Netscape HTTP Cookie File
1473 # This file is generated by yt-dlp. Do not edit.
1474
1475 '''
1476 _CookieFileEntry = collections.namedtuple(
1477 'CookieFileEntry',
1478 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1479
1480 def __init__(self, filename=None, *args, **kwargs):
1481 super().__init__(None, *args, **kwargs)
1482 if self.is_path(filename):
1483 filename = os.fspath(filename)
1484 self.filename = filename
1485
1486 @staticmethod
1487 def _true_or_false(cndn):
1488 return 'TRUE' if cndn else 'FALSE'
1489
1490 @staticmethod
1491 def is_path(file):
1492 return isinstance(file, (str, bytes, os.PathLike))
1493
1494 @contextlib.contextmanager
1495 def open(self, file, *, write=False):
1496 if self.is_path(file):
1497 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1498 yield f
1499 else:
1500 if write:
1501 file.truncate(0)
1502 yield file
1503
1504 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1505 now = time.time()
1506 for cookie in self:
1507 if (not ignore_discard and cookie.discard
1508 or not ignore_expires and cookie.is_expired(now)):
1509 continue
1510 name, value = cookie.name, cookie.value
1511 if value is None:
1512 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1513 # with no name, whereas http.cookiejar regards it as a
1514 # cookie with no value.
1515 name, value = '', name
1516 f.write('%s\n' % '\t'.join((
1517 cookie.domain,
1518 self._true_or_false(cookie.domain.startswith('.')),
1519 cookie.path,
1520 self._true_or_false(cookie.secure),
1521 str_or_none(cookie.expires, default=''),
1522 name, value
1523 )))
1524
1525 def save(self, filename=None, *args, **kwargs):
1526 """
1527 Save cookies to a file.
1528 Code is taken from CPython 3.6
1529 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
1530
1531 if filename is None:
1532 if self.filename is not None:
1533 filename = self.filename
1534 else:
1535 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
1536
1537 # Store session cookies with `expires` set to 0 instead of an empty string
1538 for cookie in self:
1539 if cookie.expires is None:
1540 cookie.expires = 0
1541
1542 with self.open(filename, write=True) as f:
1543 f.write(self._HEADER)
1544 self._really_save(f, *args, **kwargs)
1545
1546 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1547 """Load cookies from a file."""
1548 if filename is None:
1549 if self.filename is not None:
1550 filename = self.filename
1551 else:
1552 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
1553
1554 def prepare_line(line):
1555 if line.startswith(self._HTTPONLY_PREFIX):
1556 line = line[len(self._HTTPONLY_PREFIX):]
1557 # comments and empty lines are fine
1558 if line.startswith('#') or not line.strip():
1559 return line
1560 cookie_list = line.split('\t')
1561 if len(cookie_list) != self._ENTRY_LEN:
1562 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
1563 cookie = self._CookieFileEntry(*cookie_list)
1564 if cookie.expires_at and not cookie.expires_at.isdigit():
1565 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1566 return line
1567
1568 cf = io.StringIO()
1569 with self.open(filename) as f:
1570 for line in f:
1571 try:
1572 cf.write(prepare_line(line))
1573 except http.cookiejar.LoadError as e:
1574 if f'{line.strip()} '[0] in '[{"':
1575 raise http.cookiejar.LoadError(
1576 'Cookies file must be Netscape formatted, not JSON. See '
1577 'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
1578 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
1579 continue
1580 cf.seek(0)
1581 self._really_load(cf, filename, ignore_discard, ignore_expires)
1582 # Session cookies are denoted by either `expires` field set to
1583 # an empty string or 0. MozillaCookieJar only recognizes the former
1584 # (see [1]). So we need force the latter to be recognized as session
1585 # cookies on our own.
1586 # Session cookies may be important for cookies-based authentication,
1587 # e.g. usually, when user does not check 'Remember me' check box while
1588 # logging in on a site, some important cookies are stored as session
1589 # cookies so that not recognizing them will result in failed login.
1590 # 1. https://bugs.python.org/issue17164
1591 for cookie in self:
1592 # Treat `expires=0` cookies as session cookies
1593 if cookie.expires == 0:
1594 cookie.expires = None
1595 cookie.discard = True
1596
1597
1598 class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
1599 def __init__(self, cookiejar=None):
1600 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
1601
1602 def http_response(self, request, response):
1603 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
1604
1605 https_request = urllib.request.HTTPCookieProcessor.http_request
1606 https_response = http_response
1607
1608
1609 class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
1610 """YoutubeDL redirect handler
1611
1612 The code is based on HTTPRedirectHandler implementation from CPython [1].
1613
1614 This redirect handler solves two issues:
1615 - ensures redirect URL is always unicode under python 2
1616 - introduces support for experimental HTTP response status code
1617 308 Permanent Redirect [2] used by some sites [3]
1618
1619 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1620 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1621 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1622 """
1623
1624 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
1625
1626 def redirect_request(self, req, fp, code, msg, headers, newurl):
1627 """Return a Request or None in response to a redirect.
1628
1629 This is called by the http_error_30x methods when a
1630 redirection response is received. If a redirection should
1631 take place, return a new Request to allow http_error_30x to
1632 perform the redirect. Otherwise, raise HTTPError if no-one
1633 else should try to handle this url. Return None if you can't
1634 but another Handler might.
1635 """
1636 m = req.get_method()
1637 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1638 or code in (301, 302, 303) and m == "POST")):
1639 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
1640 # Strictly (according to RFC 2616), 301 or 302 in response to
1641 # a POST MUST NOT cause a redirection without confirmation
1642 # from the user (of urllib.request, in this case). In practice,
1643 # essentially all clients do redirect in this case, so we do
1644 # the same.
1645
1646 # Be conciliant with URIs containing a space. This is mainly
1647 # redundant with the more complete encoding done in http_error_302(),
1648 # but it is kept for compatibility with other callers.
1649 newurl = newurl.replace(' ', '%20')
1650
1651 CONTENT_HEADERS = ("content-length", "content-type")
1652 # NB: don't use dict comprehension for python 2.6 compatibility
1653 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1654
1655 # A 303 must either use GET or HEAD for subsequent request
1656 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1657 if code == 303 and m != 'HEAD':
1658 m = 'GET'
1659 # 301 and 302 redirects are commonly turned into a GET from a POST
1660 # for subsequent requests by browsers, so we'll do the same.
1661 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1662 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1663 if code in (301, 302) and m == 'POST':
1664 m = 'GET'
1665
1666 return urllib.request.Request(
1667 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1668 unverifiable=True, method=m)
1669
1670
1671 def extract_timezone(date_str):
1672 m = re.search(
1673 r'''(?x)
1674 ^.{8,}? # >=8 char non-TZ prefix, if present
1675 (?P<tz>Z| # just the UTC Z, or
1676 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1677 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1678 [ ]? # optional space
1679 (?P<sign>\+|-) # +/-
1680 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1681 $)
1682 ''', date_str)
1683 if not m:
1684 timezone = datetime.timedelta()
1685 else:
1686 date_str = date_str[:-len(m.group('tz'))]
1687 if not m.group('sign'):
1688 timezone = datetime.timedelta()
1689 else:
1690 sign = 1 if m.group('sign') == '+' else -1
1691 timezone = datetime.timedelta(
1692 hours=sign * int(m.group('hours')),
1693 minutes=sign * int(m.group('minutes')))
1694 return timezone, date_str
1695
1696
1697 def parse_iso8601(date_str, delimiter='T', timezone=None):
1698 """ Return a UNIX timestamp from the given date """
1699
1700 if date_str is None:
1701 return None
1702
1703 date_str = re.sub(r'\.[0-9]+', '', date_str)
1704
1705 if timezone is None:
1706 timezone, date_str = extract_timezone(date_str)
1707
1708 with contextlib.suppress(ValueError):
1709 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1710 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1711 return calendar.timegm(dt.timetuple())
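
# For illustration (a minimal sketch; both forms normalise to UTC):
#   parse_iso8601('1970-01-01T00:00:00Z')       -> 0
#   parse_iso8601('1970-01-01T01:00:00+01:00')  -> 0  (the +01:00 offset is subtracted out)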
1712
1713
1714 def date_formats(day_first=True):
1715 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1716
1717
1718 def unified_strdate(date_str, day_first=True):
1719 """Return a string with the date in the format YYYYMMDD"""
1720
1721 if date_str is None:
1722 return None
1723 upload_date = None
1724 # Replace commas
1725 date_str = date_str.replace(',', ' ')
1726 # Remove AM/PM + timezone
1727 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1728 _, date_str = extract_timezone(date_str)
1729
1730 for expression in date_formats(day_first):
1731 with contextlib.suppress(ValueError):
1732 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1733 if upload_date is None:
1734 timetuple = email.utils.parsedate_tz(date_str)
1735 if timetuple:
1736 with contextlib.suppress(ValueError):
1737 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1738 if upload_date is not None:
1739 return str(upload_date)
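
# Example usage (with the default day_first=True):
#   unified_strdate('December 21, 2012')  -> '20121221'
#   unified_strdate('8/7/2009')           -> '20090708'  (day first: 8 July)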
1740
1741
1742 def unified_timestamp(date_str, day_first=True):
1743 if date_str is None:
1744 return None
1745
1746 date_str = re.sub(r'[,|]', '', date_str)
1747
1748 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1749 timezone, date_str = extract_timezone(date_str)
1750
1751 # Remove AM/PM + timezone
1752 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1753
1754 # Remove unrecognized timezones from ISO 8601 alike timestamps
1755 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1756 if m:
1757 date_str = date_str[:-len(m.group('tz'))]
1758
1759 # Python only supports microseconds, so remove nanoseconds
1760 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1761 if m:
1762 date_str = m.group(1)
1763
1764 for expression in date_formats(day_first):
1765 with contextlib.suppress(ValueError):
1766 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1767 return calendar.timegm(dt.timetuple())
1768 timetuple = email.utils.parsedate_tz(date_str)
1769 if timetuple:
1770 return calendar.timegm(timetuple) + pm_delta * 3600
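
# Example usage (RFC 2822 dates fall through to email.utils.parsedate_tz):
#   unified_timestamp('1970-01-01T00:00:00Z')            -> 0
#   unified_timestamp('Thu, 01 Jan 1970 00:00:00 +0000') -> 0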
1771
1772
1773 def determine_ext(url, default_ext='unknown_video'):
1774 if url is None or '.' not in url:
1775 return default_ext
1776 guess = url.partition('?')[0].rpartition('.')[2]
1777 if re.match(r'^[A-Za-z0-9]+$', guess):
1778 return guess
1779 # Try to extract the extension from URLs like http://example.com/foo/bar.mp4/?download
1780 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1781 return guess.rstrip('/')
1782 else:
1783 return default_ext
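
# Example usage (URLs are illustrative):
#   determine_ext('http://example.com/video.mp4')              -> 'mp4'
#   determine_ext('http://example.com/foo/bar.mp4/?download')  -> 'mp4'  (trailing slash stripped)
#   determine_ext('no extension here')                         -> 'unknown_video'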
1784
1785
1786 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1787 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1788
1789
1790 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1791 R"""
1792 Return a datetime object from a string.
1793 Supported format:
1794 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1795
1796 @param format strftime format of DATE
1797 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1798 auto: round to the unit provided in date_str (if applicable).
1799 """
1800 auto_precision = False
1801 if precision == 'auto':
1802 auto_precision = True
1803 precision = 'microsecond'
1804 today = datetime_round(datetime.datetime.utcnow(), precision)
1805 if date_str in ('now', 'today'):
1806 return today
1807 if date_str == 'yesterday':
1808 return today - datetime.timedelta(days=1)
1809 match = re.match(
1810 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1811 date_str)
1812 if match is not None:
1813 start_time = datetime_from_str(match.group('start'), precision, format)
1814 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1815 unit = match.group('unit')
1816 if unit == 'month' or unit == 'year':
1817 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1818 unit = 'day'
1819 else:
1820 if unit == 'week':
1821 unit = 'day'
1822 time *= 7
1823 delta = datetime.timedelta(**{unit + 's': time})
1824 new_date = start_time + delta
1825 if auto_precision:
1826 return datetime_round(new_date, unit)
1827 return new_date
1828
1829 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
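
# Example usage (relative offsets are applied to the current UTC time; with the
# default precision='auto', the result is rounded to the unit given in the string):
#   datetime_from_str('now-1week')       -> the datetime one week ago, rounded to day precision
#   datetime_from_str('20200101+3days')  -> datetime.datetime(2020, 1, 4, 0, 0)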
1830
1831
1832 def date_from_str(date_str, format='%Y%m%d', strict=False):
1833 R"""
1834 Return a date object from a string using datetime_from_str
1835
1836 @param strict Restrict allowed patterns to "YYYYMMDD" and
1837 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1838 """
1839 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1840 raise ValueError(f'Invalid date format "{date_str}"')
1841 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1842
1843
1844 def datetime_add_months(dt, months):
1845 """Increment/Decrement a datetime object by months."""
1846 month = dt.month + months - 1
1847 year = dt.year + month // 12
1848 month = month % 12 + 1
1849 day = min(dt.day, calendar.monthrange(year, month)[1])
1850 return dt.replace(year, month, day)
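
# Example usage (the day is clamped to the target month's length):
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1) -> datetime.datetime(2020, 2, 29, 0, 0)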
1851
1852
1853 def datetime_round(dt, precision='day'):
1854 """
1855 Round a datetime object's time to a specific precision
1856 """
1857 if precision == 'microsecond':
1858 return dt
1859
1860 unit_seconds = {
1861 'day': 86400,
1862 'hour': 3600,
1863 'minute': 60,
1864 'second': 1,
1865 }
1866 roundto = lambda x, n: ((x + n / 2) // n) * n
1867 timestamp = calendar.timegm(dt.timetuple())
1868 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1869
1870
1871 def hyphenate_date(date_str):
1872 """
1873 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1874 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1875 if match is not None:
1876 return '-'.join(match.groups())
1877 else:
1878 return date_str
1879
1880
1881 class DateRange:
1882 """Represents a time interval between two dates"""
1883
1884 def __init__(self, start=None, end=None):
1885 """start and end must be strings in the format accepted by date"""
1886 if start is not None:
1887 self.start = date_from_str(start, strict=True)
1888 else:
1889 self.start = datetime.datetime.min.date()
1890 if end is not None:
1891 self.end = date_from_str(end, strict=True)
1892 else:
1893 self.end = datetime.datetime.max.date()
1894 if self.start > self.end:
1895 raise ValueError('Date range: "%s"; the start date must be before the end date' % self)
1896
1897 @classmethod
1898 def day(cls, day):
1899 """Returns a range that only contains the given day"""
1900 return cls(day, day)
1901
1902 def __contains__(self, date):
1903 """Check if the date is in the range"""
1904 if not isinstance(date, datetime.date):
1905 date = date_from_str(date)
1906 return self.start <= date <= self.end
1907
1908 def __str__(self):
1909 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1910
1911 def __eq__(self, other):
1912 return (isinstance(other, DateRange)
1913 and self.start == other.start and self.end == other.end)
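
# Example usage (bounds are inclusive and default to the minimum/maximum date):
#   '20200115' in DateRange('20200101', '20200131') -> True
#   '20200201' in DateRange(start='20200101')       -> True  (open-ended range)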
1914
1915
1916 def platform_name():
1917 """ Returns the platform name as a str """
1918 write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead')
1919 return platform.platform()
1920
1921
1922 @functools.cache
1923 def system_identifier():
1924 python_implementation = platform.python_implementation()
1925 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1926 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1927
1928 return 'Python %s (%s %s) - %s %s' % (
1929 platform.python_version(),
1930 python_implementation,
1931 platform.architecture()[0],
1932 platform.platform(),
1933 format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
1934 )
1935
1936
1937 @functools.cache
1938 def get_windows_version():
1939 ''' Get the Windows version. Returns () if not running on Windows '''
1940 if compat_os_name == 'nt':
1941 return version_tuple(platform.win32_ver()[1])
1942 else:
1943 return ()
1944
1945
1946 def write_string(s, out=None, encoding=None):
1947 assert isinstance(s, str)
1948 out = out or sys.stderr
1949
1950 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1951 s = re.sub(r'([\r\n]+)', r' \1', s)
1952
1953 enc, buffer = None, out
1954 if 'b' in getattr(out, 'mode', ''):
1955 enc = encoding or preferredencoding()
1956 elif hasattr(out, 'buffer'):
1957 buffer = out.buffer
1958 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1959
1960 buffer.write(s.encode(enc, 'ignore') if enc else s)
1961 out.flush()
1962
1963
1964 def bytes_to_intlist(bs):
1965 if not bs:
1966 return []
1967 if isinstance(bs[0], int): # bytes and bytearray index as ints
1968 return list(bs)
1969 else:
1970 return [ord(c) for c in bs]
1971
1972
1973 def intlist_to_bytes(xs):
1974 if not xs:
1975 return b''
1976 return struct.pack('%dB' % len(xs), *xs)
1977
1978
1979 class LockingUnsupportedError(OSError):
1980 msg = 'File locking is not supported'
1981
1982 def __init__(self):
1983 super().__init__(self.msg)
1984
1985
1986 # Cross-platform file locking
1987 if sys.platform == 'win32':
1988 import ctypes.wintypes
1989 import msvcrt
1990
1991 class OVERLAPPED(ctypes.Structure):
1992 _fields_ = [
1993 ('Internal', ctypes.wintypes.LPVOID),
1994 ('InternalHigh', ctypes.wintypes.LPVOID),
1995 ('Offset', ctypes.wintypes.DWORD),
1996 ('OffsetHigh', ctypes.wintypes.DWORD),
1997 ('hEvent', ctypes.wintypes.HANDLE),
1998 ]
1999
2000 kernel32 = ctypes.windll.kernel32
2001 LockFileEx = kernel32.LockFileEx
2002 LockFileEx.argtypes = [
2003 ctypes.wintypes.HANDLE, # hFile
2004 ctypes.wintypes.DWORD, # dwFlags
2005 ctypes.wintypes.DWORD, # dwReserved
2006 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2007 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2008 ctypes.POINTER(OVERLAPPED) # Overlapped
2009 ]
2010 LockFileEx.restype = ctypes.wintypes.BOOL
2011 UnlockFileEx = kernel32.UnlockFileEx
2012 UnlockFileEx.argtypes = [
2013 ctypes.wintypes.HANDLE, # hFile
2014 ctypes.wintypes.DWORD, # dwReserved
2015 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2016 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2017 ctypes.POINTER(OVERLAPPED) # Overlapped
2018 ]
2019 UnlockFileEx.restype = ctypes.wintypes.BOOL
2020 whole_low = 0xffffffff
2021 whole_high = 0x7fffffff
2022
2023 def _lock_file(f, exclusive, block):
2024 overlapped = OVERLAPPED()
2025 overlapped.Offset = 0
2026 overlapped.OffsetHigh = 0
2027 overlapped.hEvent = 0
2028 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2029
2030 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2031 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2032 0, whole_low, whole_high, f._lock_file_overlapped_p):
2033 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2034 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2035
2036 def _unlock_file(f):
2037 assert f._lock_file_overlapped_p
2038 handle = msvcrt.get_osfhandle(f.fileno())
2039 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2040 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2041
2042 else:
2043 try:
2044 import fcntl
2045
2046 def _lock_file(f, exclusive, block):
2047 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2048 if not block:
2049 flags |= fcntl.LOCK_NB
2050 try:
2051 fcntl.flock(f, flags)
2052 except BlockingIOError:
2053 raise
2054 except OSError: # AOSP does not have flock()
2055 fcntl.lockf(f, flags)
2056
2057 def _unlock_file(f):
2058 try:
2059 fcntl.flock(f, fcntl.LOCK_UN)
2060 except OSError:
2061 fcntl.lockf(f, fcntl.LOCK_UN)
2062
2063 except ImportError:
2064
2065 def _lock_file(f, exclusive, block):
2066 raise LockingUnsupportedError()
2067
2068 def _unlock_file(f):
2069 raise LockingUnsupportedError()
2070
2071
2072 class locked_file:
2073 locked = False
2074
2075 def __init__(self, filename, mode, block=True, encoding=None):
2076 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2077 raise NotImplementedError(mode)
2078 self.mode, self.block = mode, block
2079
2080 writable = any(f in mode for f in 'wax+')
2081 readable = any(f in mode for f in 'r+')
2082 flags = functools.reduce(operator.ior, (
2083 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2084 getattr(os, 'O_BINARY', 0), # Windows only
2085 getattr(os, 'O_NOINHERIT', 0), # Windows only
2086 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2087 os.O_APPEND if 'a' in mode else 0,
2088 os.O_EXCL if 'x' in mode else 0,
2089 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2090 ))
2091
2092 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2093
2094 def __enter__(self):
2095 exclusive = 'r' not in self.mode
2096 try:
2097 _lock_file(self.f, exclusive, self.block)
2098 self.locked = True
2099 except OSError:
2100 self.f.close()
2101 raise
2102 if 'w' in self.mode:
2103 try:
2104 self.f.truncate()
2105 except OSError as e:
2106 if e.errno not in (
2107 errno.ESPIPE, # Illegal seek - expected for FIFO
2108 errno.EINVAL, # Invalid argument - expected for /dev/null
2109 ):
2110 raise
2111 return self
2112
2113 def unlock(self):
2114 if not self.locked:
2115 return
2116 try:
2117 _unlock_file(self.f)
2118 finally:
2119 self.locked = False
2120
2121 def __exit__(self, *_):
2122 try:
2123 self.unlock()
2124 finally:
2125 self.f.close()
2126
2127 open = __enter__
2128 close = __exit__
2129
2130 def __getattr__(self, attr):
2131 return getattr(self.f, attr)
2132
2133 def __iter__(self):
2134 return iter(self.f)
2135
2136
2137 @functools.cache
2138 def get_filesystem_encoding():
2139 encoding = sys.getfilesystemencoding()
2140 return encoding if encoding is not None else 'utf-8'
2141
2142
2143 def shell_quote(args):
2144 quoted_args = []
2145 encoding = get_filesystem_encoding()
2146 for a in args:
2147 if isinstance(a, bytes):
2148 # We may get a filename encoded with 'encodeFilename'
2149 a = a.decode(encoding)
2150 quoted_args.append(compat_shlex_quote(a))
2151 return ' '.join(quoted_args)
2152
2153
2154 def smuggle_url(url, data):
2155 """ Pass additional data in a URL for internal use. """
2156
2157 url, idata = unsmuggle_url(url, {})
2158 data.update(idata)
2159 sdata = urllib.parse.urlencode(
2160 {'__youtubedl_smuggle': json.dumps(data)})
2161 return url + '#' + sdata
2162
2163
2164 def unsmuggle_url(smug_url, default=None):
2165 if '#__youtubedl_smuggle' not in smug_url:
2166 return smug_url, default
2167 url, _, sdata = smug_url.rpartition('#')
2168 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2169 data = json.loads(jsond)
2170 return url, data
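
# Example round trip (URL is illustrative; the payload travels in the fragment):
#   url = smuggle_url('https://example.com/watch', {'referer': 'https://example.com'})
#   unsmuggle_url(url) -> ('https://example.com/watch', {'referer': 'https://example.com'})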
2171
2172
2173 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2174 """ Formats numbers with decimal sufixes like K, M, etc """
2175 num, factor = float_or_none(num), float(factor)
2176 if num is None or num < 0:
2177 return None
2178 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2179 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2180 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2181 if factor == 1024:
2182 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2183 converted = num / (factor ** exponent)
2184 return fmt % (converted, suffix)
2185
2186
2187 def format_bytes(bytes):
2188 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
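
# Example usage (factor=1024 switches the suffixes to KiB/MiB/...):
#   format_decimal_suffix(12300) -> '12k'
#   format_bytes(1536)           -> '1.50KiB'
#   format_bytes(None)           -> 'N/A'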
2189
2190
2191 def lookup_unit_table(unit_table, s):
2192 units_re = '|'.join(re.escape(u) for u in unit_table)
2193 m = re.match(
2194 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2195 if not m:
2196 return None
2197 num_str = m.group('num').replace(',', '.')
2198 mult = unit_table[m.group('unit')]
2199 return int(float(num_str) * mult)
2200
2201
2202 def parse_filesize(s):
2203 if s is None:
2204 return None
2205
2206 # The lower-case forms are of course incorrect and unofficial,
2207 # but we support those too
2208 _UNIT_TABLE = {
2209 'B': 1,
2210 'b': 1,
2211 'bytes': 1,
2212 'KiB': 1024,
2213 'KB': 1000,
2214 'kB': 1024,
2215 'Kb': 1000,
2216 'kb': 1000,
2217 'kilobytes': 1000,
2218 'kibibytes': 1024,
2219 'MiB': 1024 ** 2,
2220 'MB': 1000 ** 2,
2221 'mB': 1024 ** 2,
2222 'Mb': 1000 ** 2,
2223 'mb': 1000 ** 2,
2224 'megabytes': 1000 ** 2,
2225 'mebibytes': 1024 ** 2,
2226 'GiB': 1024 ** 3,
2227 'GB': 1000 ** 3,
2228 'gB': 1024 ** 3,
2229 'Gb': 1000 ** 3,
2230 'gb': 1000 ** 3,
2231 'gigabytes': 1000 ** 3,
2232 'gibibytes': 1024 ** 3,
2233 'TiB': 1024 ** 4,
2234 'TB': 1000 ** 4,
2235 'tB': 1024 ** 4,
2236 'Tb': 1000 ** 4,
2237 'tb': 1000 ** 4,
2238 'terabytes': 1000 ** 4,
2239 'tebibytes': 1024 ** 4,
2240 'PiB': 1024 ** 5,
2241 'PB': 1000 ** 5,
2242 'pB': 1024 ** 5,
2243 'Pb': 1000 ** 5,
2244 'pb': 1000 ** 5,
2245 'petabytes': 1000 ** 5,
2246 'pebibytes': 1024 ** 5,
2247 'EiB': 1024 ** 6,
2248 'EB': 1000 ** 6,
2249 'eB': 1024 ** 6,
2250 'Eb': 1000 ** 6,
2251 'eb': 1000 ** 6,
2252 'exabytes': 1000 ** 6,
2253 'exbibytes': 1024 ** 6,
2254 'ZiB': 1024 ** 7,
2255 'ZB': 1000 ** 7,
2256 'zB': 1024 ** 7,
2257 'Zb': 1000 ** 7,
2258 'zb': 1000 ** 7,
2259 'zettabytes': 1000 ** 7,
2260 'zebibytes': 1024 ** 7,
2261 'YiB': 1024 ** 8,
2262 'YB': 1000 ** 8,
2263 'yB': 1024 ** 8,
2264 'Yb': 1000 ** 8,
2265 'yb': 1000 ** 8,
2266 'yottabytes': 1000 ** 8,
2267 'yobibytes': 1024 ** 8,
2268 }
2269
2270 return lookup_unit_table(_UNIT_TABLE, s)
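
# Example usage (a decimal comma is accepted as a decimal point):
#   parse_filesize('5 MiB')   -> 5242880
#   parse_filesize('1,24 KB') -> 1240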
2271
2272
2273 def parse_count(s):
2274 if s is None:
2275 return None
2276
2277 s = re.sub(r'^[^\d]+\s', '', s).strip()
2278
2279 if re.match(r'^[\d,.]+$', s):
2280 return str_to_int(s)
2281
2282 _UNIT_TABLE = {
2283 'k': 1000,
2284 'K': 1000,
2285 'm': 1000 ** 2,
2286 'M': 1000 ** 2,
2287 'kk': 1000 ** 2,
2288 'KK': 1000 ** 2,
2289 'b': 1000 ** 3,
2290 'B': 1000 ** 3,
2291 }
2292
2293 ret = lookup_unit_table(_UNIT_TABLE, s)
2294 if ret is not None:
2295 return ret
2296
2297 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2298 if mobj:
2299 return str_to_int(mobj.group(1))
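
# Example usage:
#   parse_count('1.23M')       -> 1230000
#   parse_count('1,000 views') -> 1000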
2300
2301
2302 def parse_resolution(s, *, lenient=False):
2303 if s is None:
2304 return {}
2305
2306 if lenient:
2307 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2308 else:
2309 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2310 if mobj:
2311 return {
2312 'width': int(mobj.group('w')),
2313 'height': int(mobj.group('h')),
2314 }
2315
2316 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2317 if mobj:
2318 return {'height': int(mobj.group(1))}
2319
2320 mobj = re.search(r'\b([48])[kK]\b', s)
2321 if mobj:
2322 return {'height': int(mobj.group(1)) * 540}
2323
2324 return {}
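
# Example usage ("4k"/"8k" are mapped to heights via the 540-line factor above):
#   parse_resolution('1920x1080') -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')      -> {'height': 720}
#   parse_resolution('4k')        -> {'height': 2160}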
2325
2326
2327 def parse_bitrate(s):
2328 if not isinstance(s, str):
2329 return
2330 mobj = re.search(r'\b(\d+)\s*kbps', s)
2331 if mobj:
2332 return int(mobj.group(1))
2333
2334
2335 def month_by_name(name, lang='en'):
2336 """ Return the number of a month by (locale-independently) English name """
2337
2338 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2339
2340 try:
2341 return month_names.index(name) + 1
2342 except ValueError:
2343 return None
2344
2345
2346 def month_by_abbreviation(abbrev):
2347 """ Return the number of a month by (locale-independently) English
2348 abbreviations """
2349
2350 try:
2351 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2352 except ValueError:
2353 return None
2354
2355
2356 def fix_xml_ampersands(xml_str):
2357 """Replace all the '&' by '&amp;' in XML"""
2358 return re.sub(
2359 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2360 '&amp;',
2361 xml_str)
2362
2363
2364 def setproctitle(title):
2365 assert isinstance(title, str)
2366
2367 # ctypes in Jython is not complete
2368 # http://bugs.jython.org/issue2148
2369 if sys.platform.startswith('java'):
2370 return
2371
2372 try:
2373 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2374 except OSError:
2375 return
2376 except TypeError:
2377 # LoadLibrary in Windows Python 2.7.13 only expects
2378 # a bytestring, but since unicode_literals turns
2379 # every string into a unicode string, it fails.
2380 return
2381 title_bytes = title.encode()
2382 buf = ctypes.create_string_buffer(len(title_bytes))
2383 buf.value = title_bytes
2384 try:
2385 libc.prctl(15, buf, 0, 0, 0)
2386 except AttributeError:
2387 return # Strange libc, just skip this
2388
2389
2390 def remove_start(s, start):
2391 return s[len(start):] if s is not None and s.startswith(start) else s
2392
2393
2394 def remove_end(s, end):
2395 return s[:-len(end)] if s is not None and s.endswith(end) else s
2396
2397
2398 def remove_quotes(s):
2399 if s is None or len(s) < 2:
2400 return s
2401 for quote in ('"', "'", ):
2402 if s[0] == quote and s[-1] == quote:
2403 return s[1:-1]
2404 return s
2405
2406
2407 def get_domain(url):
2408 return '.'.join(urllib.parse.urlparse(url).netloc.rsplit('.', 2)[-2:])
2409
2410
2411 def url_basename(url):
2412 path = urllib.parse.urlparse(url).path
2413 return path.strip('/').split('/')[-1]
2414
2415
2416 def base_url(url):
2417 return re.match(r'https?://[^?#&]+/', url).group()
2418
2419
2420 def urljoin(base, path):
2421 if isinstance(path, bytes):
2422 path = path.decode()
2423 if not isinstance(path, str) or not path:
2424 return None
2425 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2426 return path
2427 if isinstance(base, bytes):
2428 base = base.decode()
2429 if not isinstance(base, str) or not re.match(
2430 r'^(?:https?:)?//', base):
2431 return None
2432 return urllib.parse.urljoin(base, path)
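
# Example usage (URLs are illustrative; protocol-relative paths are returned unchanged):
#   urljoin('http://foo.de/', '/a/b/c.txt')         -> 'http://foo.de/a/b/c.txt'
#   urljoin('http://foo.de/', '//foo.de/a/b/c.txt') -> '//foo.de/a/b/c.txt'
#   urljoin(None, '/a/b/c.txt')                     -> None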
2433
2434
2435 class HEADRequest(urllib.request.Request):
2436 def get_method(self):
2437 return 'HEAD'
2438
2439
2440 class PUTRequest(urllib.request.Request):
2441 def get_method(self):
2442 return 'PUT'
2443
2444
2445 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2446 if get_attr and v is not None:
2447 v = getattr(v, get_attr, None)
2448 try:
2449 return int(v) * invscale // scale
2450 except (ValueError, TypeError, OverflowError):
2451 return default
2452
2453
2454 def str_or_none(v, default=None):
2455 return default if v is None else str(v)
2456
2457
2458 def str_to_int(int_str):
2459 """ A more relaxed version of int_or_none """
2460 if isinstance(int_str, int):
2461 return int_str
2462 elif isinstance(int_str, str):
2463 int_str = re.sub(r'[,\.\+]', '', int_str)
2464 return int_or_none(int_str)
2465
2466
2467 def float_or_none(v, scale=1, invscale=1, default=None):
2468 if v is None:
2469 return default
2470 try:
2471 return float(v) * invscale / scale
2472 except (ValueError, TypeError):
2473 return default
2474
2475
2476 def bool_or_none(v, default=None):
2477 return v if isinstance(v, bool) else default
2478
2479
2480 def strip_or_none(v, default=None):
2481 return v.strip() if isinstance(v, str) else default
2482
2483
2484 def url_or_none(url):
2485 if not url or not isinstance(url, str):
2486 return None
2487 url = url.strip()
2488 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2489
2490
2491 def request_to_url(req):
2492 if isinstance(req, urllib.request.Request):
2493 return req.get_full_url()
2494 else:
2495 return req
2496
2497
2498 def strftime_or_none(timestamp, date_format, default=None):
2499 datetime_object = None
2500 try:
2501 if isinstance(timestamp, (int, float)): # unix timestamp
2502 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2503 elif isinstance(timestamp, str): # assume YYYYMMDD
2504 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2505 return datetime_object.strftime(date_format)
2506 except (ValueError, TypeError, AttributeError):
2507 return default
2508
2509
2510 def parse_duration(s):
2511 if not isinstance(s, str):
2512 return None
2513 s = s.strip()
2514 if not s:
2515 return None
2516
2517 days, hours, mins, secs, ms = [None] * 5
2518 m = re.match(r'''(?x)
2519 (?P<before_secs>
2520 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2521 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2522 (?P<ms>[.:][0-9]+)?Z?$
2523 ''', s)
2524 if m:
2525 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2526 else:
2527 m = re.match(
2528 r'''(?ix)(?:P?
2529 (?:
2530 [0-9]+\s*y(?:ears?)?,?\s*
2531 )?
2532 (?:
2533 [0-9]+\s*m(?:onths?)?,?\s*
2534 )?
2535 (?:
2536 [0-9]+\s*w(?:eeks?)?,?\s*
2537 )?
2538 (?:
2539 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2540 )?
2541 T)?
2542 (?:
2543 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2544 )?
2545 (?:
2546 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2547 )?
2548 (?:
2549 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2550 )?Z?$''', s)
2551 if m:
2552 days, hours, mins, secs, ms = m.groups()
2553 else:
2554 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2555 if m:
2556 hours, mins = m.groups()
2557 else:
2558 return None
2559
2560 if ms:
2561 ms = ms.replace(':', '.')
2562 return sum(float(part or 0) * mult for part, mult in (
2563 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
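
# Example usage (clock-style, ISO 8601-like and verbose forms are all accepted):
#   parse_duration('9:12:43') -> 33163.0
#   parse_duration('PT0M10S') -> 10.0
#   parse_duration('3 min')   -> 180.0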
2564
2565
2566 def prepend_extension(filename, ext, expected_real_ext=None):
2567 name, real_ext = os.path.splitext(filename)
2568 return (
2569 f'{name}.{ext}{real_ext}'
2570 if not expected_real_ext or real_ext[1:] == expected_real_ext
2571 else f'{filename}.{ext}')
2572
2573
2574 def replace_extension(filename, ext, expected_real_ext=None):
2575 name, real_ext = os.path.splitext(filename)
2576 return '{}.{}'.format(
2577 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2578 ext)
2579
2580
2581 def check_executable(exe, args=[]):
2582 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2583 args can be a list of arguments for a short output (like -version) """
2584 try:
2585 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2586 except OSError:
2587 return False
2588 return exe
2589
2590
2591 def _get_exe_version_output(exe, args, *, to_screen=None):
2592 if to_screen:
2593 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2594 try:
2595 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2596 # SIGTTOU if yt-dlp is run in the background.
2597 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2598 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2599 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2600 except OSError:
2601 return False
2602 return stdout
2603
2604
2605 def detect_exe_version(output, version_re=None, unrecognized='present'):
2606 assert isinstance(output, str)
2607 if version_re is None:
2608 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2609 m = re.search(version_re, output)
2610 if m:
2611 return m.group(1)
2612 else:
2613 return unrecognized
2614
2615
2616 def get_exe_version(exe, args=['--version'],
2617 version_re=None, unrecognized='present'):
2618 """ Returns the version of the specified executable,
2619 or False if the executable is not present """
2620 out = _get_exe_version_output(exe, args)
2621 return detect_exe_version(out, version_re, unrecognized) if out else False
2622
2623
2624 def frange(start=0, stop=None, step=1):
2625 """Float range"""
2626 if stop is None:
2627 start, stop = 0, start
2628 sign = [-1, 1][step > 0] if step else 0
2629 while sign * start < sign * stop:
2630 yield start
2631 start += step
2632
2633
2634 class LazyList(collections.abc.Sequence):
2635 """Lazy immutable list from an iterable
2636 Note that slices of a LazyList are lists and not LazyList"""
2637
2638 class IndexError(IndexError):
2639 pass
2640
2641 def __init__(self, iterable, *, reverse=False, _cache=None):
2642 self._iterable = iter(iterable)
2643 self._cache = [] if _cache is None else _cache
2644 self._reversed = reverse
2645
2646 def __iter__(self):
2647 if self._reversed:
2648 # We need to consume the entire iterable to iterate in reverse
2649 yield from self.exhaust()
2650 return
2651 yield from self._cache
2652 for item in self._iterable:
2653 self._cache.append(item)
2654 yield item
2655
2656 def _exhaust(self):
2657 self._cache.extend(self._iterable)
2658 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2659 return self._cache
2660
2661 def exhaust(self):
2662 """Evaluate the entire iterable"""
2663 return self._exhaust()[::-1 if self._reversed else 1]
2664
2665 @staticmethod
2666 def _reverse_index(x):
2667 return None if x is None else ~x
2668
2669 def __getitem__(self, idx):
2670 if isinstance(idx, slice):
2671 if self._reversed:
2672 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2673 start, stop, step = idx.start, idx.stop, idx.step or 1
2674 elif isinstance(idx, int):
2675 if self._reversed:
2676 idx = self._reverse_index(idx)
2677 start, stop, step = idx, idx, 0
2678 else:
2679 raise TypeError('indices must be integers or slices')
2680 if ((start or 0) < 0 or (stop or 0) < 0
2681 or (start is None and step < 0)
2682 or (stop is None and step > 0)):
2683 # We need to consume the entire iterable to be able to slice from the end
2684 # Obviously, never use this with infinite iterables
2685 self._exhaust()
2686 try:
2687 return self._cache[idx]
2688 except IndexError as e:
2689 raise self.IndexError(e) from e
2690 n = max(start or 0, stop or 0) - len(self._cache) + 1
2691 if n > 0:
2692 self._cache.extend(itertools.islice(self._iterable, n))
2693 try:
2694 return self._cache[idx]
2695 except IndexError as e:
2696 raise self.IndexError(e) from e
2697
2698 def __bool__(self):
2699 try:
2700 self[-1] if self._reversed else self[0]
2701 except self.IndexError:
2702 return False
2703 return True
2704
2705 def __len__(self):
2706 self._exhaust()
2707 return len(self._cache)
2708
2709 def __reversed__(self):
2710 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2711
2712 def __copy__(self):
2713 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2714
2715 def __repr__(self):
2716 # repr and str should mimic a list. So we exhaust the iterable
2717 return repr(self.exhaust())
2718
2719 def __str__(self):
2720 return repr(self.exhaust())
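
# Example usage (only as much of the iterable as the index needs is consumed):
#   lazy = LazyList(itertools.count())
#   lazy[:5]  -> [0, 1, 2, 3, 4]  (slices are plain lists, per the docstring above)
#   lazy[10]  -> 10               (items 0-10 are now cached)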
2721
2722
2723 class PagedList:
2724
2725 class IndexError(IndexError):
2726 pass
2727
2728 def __len__(self):
2729 # This is only useful for tests
2730 return len(self.getslice())
2731
2732 def __init__(self, pagefunc, pagesize, use_cache=True):
2733 self._pagefunc = pagefunc
2734 self._pagesize = pagesize
2735 self._pagecount = float('inf')
2736 self._use_cache = use_cache
2737 self._cache = {}
2738
2739 def getpage(self, pagenum):
2740 page_results = self._cache.get(pagenum)
2741 if page_results is None:
2742 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2743 if self._use_cache:
2744 self._cache[pagenum] = page_results
2745 return page_results
2746
2747 def getslice(self, start=0, end=None):
2748 return list(self._getslice(start, end))
2749
2750 def _getslice(self, start, end):
2751 raise NotImplementedError('This method must be implemented by subclasses')
2752
2753 def __getitem__(self, idx):
2754 assert self._use_cache, 'Indexing PagedList requires cache'
2755 if not isinstance(idx, int) or idx < 0:
2756 raise TypeError('indices must be non-negative integers')
2757 entries = self.getslice(idx, idx + 1)
2758 if not entries:
2759 raise self.IndexError()
2760 return entries[0]
2761
2762
2763 class OnDemandPagedList(PagedList):
2764 """Download pages until a page with less than maximum results"""
2765
2766 def _getslice(self, start, end):
2767 for pagenum in itertools.count(start // self._pagesize):
2768 firstid = pagenum * self._pagesize
2769 nextfirstid = pagenum * self._pagesize + self._pagesize
2770 if start >= nextfirstid:
2771 continue
2772
2773 startv = (
2774 start % self._pagesize
2775 if firstid <= start < nextfirstid
2776 else 0)
2777 endv = (
2778 ((end - 1) % self._pagesize) + 1
2779 if (end is not None and firstid <= end <= nextfirstid)
2780 else None)
2781
2782 try:
2783 page_results = self.getpage(pagenum)
2784 except Exception:
2785 self._pagecount = pagenum - 1
2786 raise
2787 if startv != 0 or endv is not None:
2788 page_results = page_results[startv:endv]
2789 yield from page_results
2790
2791 # A little optimization: if the current page is not "full", i.e. does
2792 # not contain page_size videos, then we can assume that this page
2793 # is the last one - there are no more ids on further pages -
2794 # so there is no need to query again.
2795 if len(page_results) + startv < self._pagesize:
2796 break
2797
2798 # If we got the whole page, but the next page is not interesting,
2799 # break out early as well
2800 if end == nextfirstid:
2801 break
2802
2803
2804 class InAdvancePagedList(PagedList):
2805 """PagedList with total number of pages known in advance"""
2806
2807 def __init__(self, pagefunc, pagecount, pagesize):
2808 PagedList.__init__(self, pagefunc, pagesize, True)
2809 self._pagecount = pagecount
2810
2811 def _getslice(self, start, end):
2812 start_page = start // self._pagesize
2813 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2814 skip_elems = start - start_page * self._pagesize
2815 only_more = None if end is None else end - start
2816 for pagenum in range(start_page, end_page):
2817 page_results = self.getpage(pagenum)
2818 if skip_elems:
2819 page_results = page_results[skip_elems:]
2820 skip_elems = None
2821 if only_more is not None:
2822 if len(page_results) < only_more:
2823 only_more -= len(page_results)
2824 else:
2825 yield from page_results[:only_more]
2826 break
2827 yield from page_results
2828
2829
2830 class PlaylistEntries:
2831 MissingEntry = object()
2832 is_exhausted = False
2833
2834 def __init__(self, ydl, info_dict):
2835 self.ydl = ydl
2836
2837 # _entries must be assigned now since infodict can change during iteration
2838 entries = info_dict.get('entries')
2839 if entries is None:
2840 raise EntryNotInPlaylist('There are no entries')
2841 elif isinstance(entries, list):
2842 self.is_exhausted = True
2843
2844 requested_entries = info_dict.get('requested_entries')
2845 self.is_incomplete = bool(requested_entries)
2846 if self.is_incomplete:
2847 assert self.is_exhausted
2848 self._entries = [self.MissingEntry] * max(requested_entries)
2849 for i, entry in zip(requested_entries, entries):
2850 self._entries[i - 1] = entry
2851 elif isinstance(entries, (list, PagedList, LazyList)):
2852 self._entries = entries
2853 else:
2854 self._entries = LazyList(entries)
2855
2856 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2857 (?P<start>[+-]?\d+)?
2858 (?P<range>[:-]
2859 (?P<end>[+-]?\d+|inf(?:inite)?)?
2860 (?::(?P<step>[+-]?\d+))?
2861 )?''')
2862
2863 @classmethod
2864 def parse_playlist_items(cls, string):
2865 for segment in string.split(','):
2866 if not segment:
2867 raise ValueError('There are two or more consecutive commas')
2868 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2869 if not mobj:
2870 raise ValueError(f'{segment!r} is not a valid specification')
2871 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2872 if int_or_none(step) == 0:
2873 raise ValueError(f'Step in {segment!r} cannot be zero')
2874 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2875
2876 def get_requested_items(self):
2877 playlist_items = self.ydl.params.get('playlist_items')
2878 playlist_start = self.ydl.params.get('playliststart', 1)
2879 playlist_end = self.ydl.params.get('playlistend')
2880 # For backwards compatibility, interpret -1 as whole list
2881 if playlist_end in (-1, None):
2882 playlist_end = ''
2883 if not playlist_items:
2884 playlist_items = f'{playlist_start}:{playlist_end}'
2885 elif playlist_start != 1 or playlist_end:
2886 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2887
2888 for index in self.parse_playlist_items(playlist_items):
2889 for i, entry in self[index]:
2890 yield i, entry
2891 if not entry:
2892 continue
2893 try:
2894 # TODO: Add auto-generated fields
2895 self.ydl._match_entry(entry, incomplete=True, silent=True)
2896 except (ExistingVideoReached, RejectedVideoReached):
2897 return
2898
2899 def get_full_count(self):
2900 if self.is_exhausted and not self.is_incomplete:
2901 return len(self)
2902 elif isinstance(self._entries, InAdvancePagedList):
2903 if self._entries._pagesize == 1:
2904 return self._entries._pagecount
2905
2906 @functools.cached_property
2907 def _getter(self):
2908 if isinstance(self._entries, list):
2909 def get_entry(i):
2910 try:
2911 entry = self._entries[i]
2912 except IndexError:
2913 entry = self.MissingEntry
2914 if not self.is_incomplete:
2915 raise self.IndexError()
2916 if entry is self.MissingEntry:
2917 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2918 return entry
2919 else:
2920 def get_entry(i):
2921 try:
2922 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2923 except (LazyList.IndexError, PagedList.IndexError):
2924 raise self.IndexError()
2925 return get_entry
2926
2927 def __getitem__(self, idx):
2928 if isinstance(idx, int):
2929 idx = slice(idx, idx)
2930
2931 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2932 step = 1 if idx.step is None else idx.step
2933 if idx.start is None:
2934 start = 0 if step > 0 else len(self) - 1
2935 else:
2936 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2937
2938 # NB: Do not call len(self) when idx == [:]
2939 if idx.stop is None:
2940 stop = 0 if step < 0 else float('inf')
2941 else:
2942 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2943 stop += [-1, 1][step > 0]
2944
2945 for i in frange(start, stop, step):
2946 if i < 0:
2947 continue
2948 try:
2949 entry = self._getter(i)
2950 except self.IndexError:
2951 self.is_exhausted = True
2952 if step > 0:
2953 break
2954 continue
2955 yield i + 1, entry
2956
2957 def __len__(self):
2958 return len(tuple(self[:]))
2959
2960 class IndexError(IndexError):
2961 pass
2962
2963
2964 def uppercase_escape(s):
2965 unicode_escape = codecs.getdecoder('unicode_escape')
2966 return re.sub(
2967 r'\\U[0-9a-fA-F]{8}',
2968 lambda m: unicode_escape(m.group(0))[0],
2969 s)
2970
2971
2972 def lowercase_escape(s):
2973 unicode_escape = codecs.getdecoder('unicode_escape')
2974 return re.sub(
2975 r'\\u[0-9a-fA-F]{4}',
2976 lambda m: unicode_escape(m.group(0))[0],
2977 s)
2978
2979
2980 def escape_rfc3986(s):
2981 """Escape non-ASCII characters as suggested by RFC 3986"""
2982 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2983
2984
2985 def escape_url(url):
2986 """Escape URL as suggested by RFC 3986"""
2987 url_parsed = urllib.parse.urlparse(url)
2988 return url_parsed._replace(
2989 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2990 path=escape_rfc3986(url_parsed.path),
2991 params=escape_rfc3986(url_parsed.params),
2992 query=escape_rfc3986(url_parsed.query),
2993 fragment=escape_rfc3986(url_parsed.fragment)
2994 ).geturl()
2995
2996
2997 def parse_qs(url):
2998 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
2999
3000
3001 def read_batch_urls(batch_fd):
3002 def fixup(url):
3003 if not isinstance(url, str):
3004 url = url.decode('utf-8', 'replace')
3005 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3006 for bom in BOM_UTF8:
3007 if url.startswith(bom):
3008 url = url[len(bom):]
3009 url = url.lstrip()
3010 if not url or url.startswith(('#', ';', ']')):
3011 return False
3012 # "#" cannot be stripped out since it is part of the URI
3013 # However, it can be safely stripped out if it follows whitespace
3014 return re.split(r'\s#', url, 1)[0].rstrip()
3015
3016 with contextlib.closing(batch_fd) as fd:
3017 return [url for url in map(fixup, fd) if url]
3018
3019
3020 def urlencode_postdata(*args, **kargs):
3021 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3022
3023
3024 def update_url_query(url, query):
3025 if not query:
3026 return url
3027 parsed_url = urllib.parse.urlparse(url)
3028 qs = urllib.parse.parse_qs(parsed_url.query)
3029 qs.update(query)
3030 return urllib.parse.urlunparse(parsed_url._replace(
3031 query=urllib.parse.urlencode(qs, True)))
3032
3033
3034 def update_Request(req, url=None, data=None, headers=None, query=None):
3035 req_headers = req.headers.copy()
3036 req_headers.update(headers or {})
3037 req_data = data or req.data
3038 req_url = update_url_query(url or req.get_full_url(), query)
3039 req_get_method = req.get_method()
3040 if req_get_method == 'HEAD':
3041 req_type = HEADRequest
3042 elif req_get_method == 'PUT':
3043 req_type = PUTRequest
3044 else:
3045 req_type = urllib.request.Request
3046 new_req = req_type(
3047 req_url, data=req_data, headers=req_headers,
3048 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3049 if hasattr(req, 'timeout'):
3050 new_req.timeout = req.timeout
3051 return new_req
3052
3053
3054 def _multipart_encode_impl(data, boundary):
3055 content_type = 'multipart/form-data; boundary=%s' % boundary
3056
3057 out = b''
3058 for k, v in data.items():
3059 out += b'--' + boundary.encode('ascii') + b'\r\n'
3060 if isinstance(k, str):
3061 k = k.encode()
3062 if isinstance(v, str):
3063 v = v.encode()
3064 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3065 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3066 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3067 if boundary.encode('ascii') in content:
3068 raise ValueError('Boundary overlaps with data')
3069 out += content
3070
3071 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3072
3073 return out, content_type
3074
3075
3076 def multipart_encode(data, boundary=None):
3077 '''
3078 Encode a dict to RFC 7578-compliant form-data
3079
3080 data:
3081 A dict where keys and values can be either Unicode or bytes-like
3082 objects.
3083 boundary:
3084 If specified, it must be a Unicode object and is used as the boundary.
3085 Otherwise a random boundary is generated.
3086
3087 Reference: https://tools.ietf.org/html/rfc7578
3088 '''
3089 has_specified_boundary = boundary is not None
3090
3091 while True:
3092 if boundary is None:
3093 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3094
3095 try:
3096 out, content_type = _multipart_encode_impl(data, boundary)
3097 break
3098 except ValueError:
3099 if has_specified_boundary:
3100 raise
3101 boundary = None
3102
3103 return out, content_type
3104
3105
3106 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3107 for val in map(d.get, variadic(key_or_keys)):
3108 if val is not None and (val or not skip_false_values):
3109 return val
3110 return default
3111
3112
3113 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3114 for f in funcs:
3115 try:
3116 val = f(*args, **kwargs)
3117 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3118 pass
3119 else:
3120 if expected_type is None or isinstance(val, expected_type):
3121 return val
3122
3123
3124 def try_get(src, getter, expected_type=None):
3125 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3126
3127
3128 def filter_dict(dct, cndn=lambda _, v: v is not None):
3129 return {k: v for k, v in dct.items() if cndn(k, v)}
3130
3131
3132 def merge_dicts(*dicts):
3133 merged = {}
3134 for a_dict in dicts:
3135 for k, v in a_dict.items():
3136 if (v is not None and k not in merged
3137 or isinstance(v, str) and merged[k] == ''):
3138 merged[k] = v
3139 return merged
3140
3141
3142 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3143 return string if isinstance(string, str) else str(string, encoding, errors)
3144
3145
3146 US_RATINGS = {
3147 'G': 0,
3148 'PG': 10,
3149 'PG-13': 13,
3150 'R': 16,
3151 'NC': 18,
3152 }
3153
3154
3155 TV_PARENTAL_GUIDELINES = {
3156 'TV-Y': 0,
3157 'TV-Y7': 7,
3158 'TV-G': 0,
3159 'TV-PG': 0,
3160 'TV-14': 14,
3161 'TV-MA': 17,
3162 }
3163
3164
3165 def parse_age_limit(s):
3166 # isinstance(False, int) is True. So type() must be used instead
3167 if type(s) is int: # noqa: E721
3168 return s if 0 <= s <= 21 else None
3169 elif not isinstance(s, str):
3170 return None
3171 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3172 if m:
3173 return int(m.group('age'))
3174 s = s.upper()
3175 if s in US_RATINGS:
3176 return US_RATINGS[s]
3177 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3178 if m:
3179 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3180 return None
3181
3182
3183 def strip_jsonp(code):
3184 return re.sub(
3185 r'''(?sx)^
3186 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3187 (?:\s*&&\s*(?P=func_name))?
3188 \s*\(\s*(?P<callback_data>.*)\);?
3189 \s*?(?://[^\n]*)*$''',
3190 r'\g<callback_data>', code)
3191
3192
3193 def js_to_json(code, vars={}):
3194 # vars is a dict of var, val pairs to substitute
3195 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3196 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3197 INTEGER_TABLE = (
3198 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3199 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3200 )
3201
3202 def fix_kv(m):
3203 v = m.group(0)
3204 if v in ('true', 'false', 'null'):
3205 return v
3206 elif v in ('undefined', 'void 0'):
3207 return 'null'
3208 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3209 return ""
3210
3211 if v[0] in ("'", '"'):
3212 v = re.sub(r'(?s)\\.|"', lambda m: {
3213 '"': '\\"',
3214 "\\'": "'",
3215 '\\\n': '',
3216 '\\x': '\\u00',
3217 }.get(m.group(0), m.group(0)), v[1:-1])
3218 else:
3219 for regex, base in INTEGER_TABLE:
3220 im = re.match(regex, v)
3221 if im:
3222 i = int(im.group(1), base)
3223 return '"%d":' % i if v.endswith(':') else '%d' % i
3224
3225 if v in vars:
3226 return vars[v]
3227
3228 return '"%s"' % v
3229
3230 def create_map(mobj):
3231 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3232
3233 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3234 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3235
3236 return re.sub(r'''(?sx)
3237 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3238 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3239 {comment}|,(?={skip}[\]}}])|
3240 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3241 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3242 [0-9]+(?={skip}:)|
3243 !+
3244 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
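
# Example usage (single quotes, trailing commas and `undefined` are normalised):
#   js_to_json("{'clip':{'provider':'pseudo'}}") -> '{"clip":{"provider":"pseudo"}}'
#   js_to_json('{"abc": "def",}')                -> '{"abc": "def"}'
#   js_to_json('{"x": undefined}')               -> '{"x": null}'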
3245
3246
3247 def qualities(quality_ids):
3248 """ Get a numeric quality value out of a list of possible values """
3249 def q(qid):
3250 try:
3251 return quality_ids.index(qid)
3252 except ValueError:
3253 return -1
3254 return q
3255
3256
3257 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3258
3259
3260 DEFAULT_OUTTMPL = {
3261 'default': '%(title)s [%(id)s].%(ext)s',
3262 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3263 }
3264 OUTTMPL_TYPES = {
3265 'chapter': None,
3266 'subtitle': None,
3267 'thumbnail': None,
3268 'description': 'description',
3269 'annotation': 'annotations.xml',
3270 'infojson': 'info.json',
3271 'link': None,
3272 'pl_video': None,
3273 'pl_thumbnail': None,
3274 'pl_description': 'description',
3275 'pl_infojson': 'info.json',
3276 }
3277
3278 # As of [1] format syntax is:
3279 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3280 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3281 STR_FORMAT_RE_TMPL = r'''(?x)
3282 (?<!%)(?P<prefix>(?:%%)*)
3283 %
3284 (?P<has_key>\((?P<key>{0})\))?
3285 (?P<format>
3286 (?P<conversion>[#0\-+ ]+)?
3287 (?P<min_width>\d+)?
3288 (?P<precision>\.\d+)?
3289 (?P<len_mod>[hlL])? # unused in python
3290 {1} # conversion type
3291 )
3292 '''
3293
3294
3295 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3296
3297
3298 def limit_length(s, length):
3299 """ Add ellipses to overly long strings """
3300 if s is None:
3301 return None
3302 ELLIPSES = '...'
3303 if len(s) > length:
3304 return s[:length - len(ELLIPSES)] + ELLIPSES
3305 return s
3306
3307
3308 def version_tuple(v):
3309 return tuple(int(e) for e in re.split(r'[-.]', v))
3310
3311
3312 def is_outdated_version(version, limit, assume_new=True):
3313 if not version:
3314 return not assume_new
3315 try:
3316 return version_tuple(version) < version_tuple(limit)
3317 except ValueError:
3318 return not assume_new
3319
3320
3321 def ytdl_is_updateable():
3322 """ Returns if yt-dlp can be updated with -U """
3323
3324 from .update import is_non_updateable
3325
3326 return not is_non_updateable()
3327
3328
3329 def args_to_str(args):
3330 # Get a short string representation for a subprocess command
3331 return ' '.join(compat_shlex_quote(a) for a in args)
3332
3333
3334 def error_to_compat_str(err):
3335 return str(err)
3336
3337
3338 def error_to_str(err):
3339 return f'{type(err).__name__}: {err}'
3340
3341
3342 def mimetype2ext(mt):
3343 if mt is None:
3344 return None
3345
3346 mt, _, params = mt.partition(';')
3347 mt = mt.strip()
3348
3349 FULL_MAP = {
3350 'audio/mp4': 'm4a',
3351 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
3352 # since it is the most popular one
3353 'audio/mpeg': 'mp3',
3354 'audio/x-wav': 'wav',
3355 'audio/wav': 'wav',
3356 'audio/wave': 'wav',
3357 }
3358
3359 ext = FULL_MAP.get(mt)
3360 if ext is not None:
3361 return ext
3362
3363 SUBTYPE_MAP = {
3364 '3gpp': '3gp',
3365 'smptett+xml': 'tt',
3366 'ttaf+xml': 'dfxp',
3367 'ttml+xml': 'ttml',
3368 'x-flv': 'flv',
3369 'x-mp4-fragmented': 'mp4',
3370 'x-ms-sami': 'sami',
3371 'x-ms-wmv': 'wmv',
3372 'mpegurl': 'm3u8',
3373 'x-mpegurl': 'm3u8',
3374 'vnd.apple.mpegurl': 'm3u8',
3375 'dash+xml': 'mpd',
3376 'f4m+xml': 'f4m',
3377 'hds+xml': 'f4m',
3378 'vnd.ms-sstr+xml': 'ism',
3379 'quicktime': 'mov',
3380 'mp2t': 'ts',
3381 'x-wav': 'wav',
3382 'filmstrip+json': 'fs',
3383 'svg+xml': 'svg',
3384 }
3385
3386 _, _, subtype = mt.rpartition('/')
3387 ext = SUBTYPE_MAP.get(subtype.lower())
3388 if ext is not None:
3389 return ext
3390
3391 SUFFIX_MAP = {
3392 'json': 'json',
3393 'xml': 'xml',
3394 'zip': 'zip',
3395 'gzip': 'gz',
3396 }
3397
3398 _, _, suffix = subtype.partition('+')
3399 ext = SUFFIX_MAP.get(suffix)
3400 if ext is not None:
3401 return ext
3402
3403 return subtype.replace('+', '.')
3404
3405
3406 def ext2mimetype(ext_or_url):
3407 if not ext_or_url:
3408 return None
3409 if '.' not in ext_or_url:
3410 ext_or_url = f'file.{ext_or_url}'
3411 return mimetypes.guess_type(ext_or_url)[0]
3412
3413
3414 def parse_codecs(codecs_str):
3415 # http://tools.ietf.org/html/rfc6381
3416 if not codecs_str:
3417 return {}
3418 split_codecs = list(filter(None, map(
3419 str.strip, codecs_str.strip().strip(',').split(','))))
3420 vcodec, acodec, scodec, hdr = None, None, None, None
3421 for full_codec in split_codecs:
3422 parts = full_codec.split('.')
3423 codec = parts[0].replace('0', '')
3424 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3425 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3426 if not vcodec:
3427 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3428 if codec in ('dvh1', 'dvhe'):
3429 hdr = 'DV'
3430 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3431 hdr = 'HDR10'
3432 elif full_codec.replace('0', '').startswith('vp9.2'):
3433 hdr = 'HDR10'
3434 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3435 if not acodec:
3436 acodec = full_codec
3437 elif codec in ('stpp', 'wvtt',):
3438 if not scodec:
3439 scodec = full_codec
3440 else:
3441 write_string(f'WARNING: Unknown codec {full_codec}\n')
3442 if vcodec or acodec or scodec:
3443 return {
3444 'vcodec': vcodec or 'none',
3445 'acodec': acodec or 'none',
3446 'dynamic_range': hdr,
3447 **({'scodec': scodec} if scodec is not None else {}),
3448 }
3449 elif len(split_codecs) == 2:
3450 return {
3451 'vcodec': split_codecs[0],
3452 'acodec': split_codecs[1],
3453 }
3454 return {}
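
# Example usage (Dolby Vision and 10-bit AV1/VP9 codecs set dynamic_range):
#   parse_codecs('avc1.77.30, mp4a.40.2')
#       -> {'vcodec': 'avc1.77.30', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
#   parse_codecs('dvhe.05.06')
#       -> {'vcodec': 'dvhe.05.06', 'acodec': 'none', 'dynamic_range': 'DV'}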
3455
3456
3457 def urlhandle_detect_ext(url_handle):
3458 getheader = url_handle.headers.get
3459
3460 cd = getheader('Content-Disposition')
3461 if cd:
3462 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3463 if m:
3464 e = determine_ext(m.group('filename'), default_ext=None)
3465 if e:
3466 return e
3467
3468 return mimetype2ext(getheader('Content-Type'))
3469
3470
3471 def encode_data_uri(data, mime_type):
3472 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3473
3474
3475 def age_restricted(content_limit, age_limit):
3476 """ Returns True iff the content should be blocked """
3477
3478 if age_limit is None: # No limit set
3479 return False
3480 if content_limit is None:
3481 return False # Content available for everyone
3482 return age_limit < content_limit
3483
3484
3485 def is_html(first_bytes):
3486 """ Detect whether a file contains HTML by examining its first bytes. """
3487
3488 BOMS = [
3489 (b'\xef\xbb\xbf', 'utf-8'),
3490 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3491 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3492 (b'\xff\xfe', 'utf-16-le'),
3493 (b'\xfe\xff', 'utf-16-be'),
3494 ]
3495
3496 encoding = 'utf-8'
3497 for bom, enc in BOMS:
3498 while first_bytes.startswith(bom):
3499 encoding, first_bytes = enc, first_bytes[len(bom):]
3500
3501 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3502
3503
3504 def determine_protocol(info_dict):
3505 protocol = info_dict.get('protocol')
3506 if protocol is not None:
3507 return protocol
3508
3509 url = sanitize_url(info_dict['url'])
3510 if url.startswith('rtmp'):
3511 return 'rtmp'
3512 elif url.startswith('mms'):
3513 return 'mms'
3514 elif url.startswith('rtsp'):
3515 return 'rtsp'
3516
3517 ext = determine_ext(url)
3518 if ext == 'm3u8':
3519 return 'm3u8'
3520 elif ext == 'f4m':
3521 return 'f4m'
3522
3523 return urllib.parse.urlparse(url).scheme
3524
3525
3526 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3527 """ Render a list of rows, each as a list of values.
3528 Text after a \t will be right aligned """
3529 def width(string):
3530 return len(remove_terminal_sequences(string).replace('\t', ''))
3531
3532 def get_max_lens(table):
3533 return [max(width(str(v)) for v in col) for col in zip(*table)]
3534
3535 def filter_using_list(row, filterArray):
3536 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3537
3538 max_lens = get_max_lens(data) if hide_empty else []
3539 header_row = filter_using_list(header_row, max_lens)
3540 data = [filter_using_list(row, max_lens) for row in data]
3541
3542 table = [header_row] + data
3543 max_lens = get_max_lens(table)
3544 extra_gap += 1
3545 if delim:
3546 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3547 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3548 for row in table:
3549 for pos, text in enumerate(map(str, row)):
3550 if '\t' in text:
3551 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3552 else:
3553 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3554 ret = '\n'.join(''.join(row).rstrip() for row in table)
3555 return ret
3556
3557
3558 def _match_one(filter_part, dct, incomplete):
3559 # TODO: Generalize code with YoutubeDL._build_format_filter
3560 STRING_OPERATORS = {
3561 '*=': operator.contains,
3562 '^=': lambda attr, value: attr.startswith(value),
3563 '$=': lambda attr, value: attr.endswith(value),
3564 '~=': lambda attr, value: re.search(value, attr),
3565 }
3566 COMPARISON_OPERATORS = {
3567 **STRING_OPERATORS,
3568 '<=': operator.le, # "<=" must be defined above "<"
3569 '<': operator.lt,
3570 '>=': operator.ge,
3571 '>': operator.gt,
3572 '=': operator.eq,
3573 }
3574
3575 if isinstance(incomplete, bool):
3576 is_incomplete = lambda _: incomplete
3577 else:
3578 is_incomplete = lambda k: k in incomplete
3579
3580 operator_rex = re.compile(r'''(?x)
3581 (?P<key>[a-z_]+)
3582 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3583 (?:
3584 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3585 (?P<strval>.+?)
3586 )
3587 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3588 m = operator_rex.fullmatch(filter_part.strip())
3589 if m:
3590 m = m.groupdict()
3591 unnegated_op = COMPARISON_OPERATORS[m['op']]
3592 if m['negation']:
3593 op = lambda attr, value: not unnegated_op(attr, value)
3594 else:
3595 op = unnegated_op
3596 comparison_value = m['quotedstrval'] or m['strval']  # the pattern defines no 'intval' group; one of these two always matches
3597 if m['quote']:
3598 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3599 actual_value = dct.get(m['key'])
3600 numeric_comparison = None
3601 if isinstance(actual_value, (int, float)):
3602 # If the original field is a string and the comparison value is a
3603 # number, we should respect the origin of the original field
3604 # and process the comparison value as a string (see
3605 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3606 try:
3607 numeric_comparison = int(comparison_value)
3608 except ValueError:
3609 numeric_comparison = parse_filesize(comparison_value)
3610 if numeric_comparison is None:
3611 numeric_comparison = parse_filesize(f'{comparison_value}B')
3612 if numeric_comparison is None:
3613 numeric_comparison = parse_duration(comparison_value)
3614 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3615 raise ValueError('Operator %s only supports string values!' % m['op'])
3616 if actual_value is None:
3617 return is_incomplete(m['key']) or m['none_inclusive']
3618 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3619
3620 UNARY_OPERATORS = {
3621 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3622 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3623 }
3624 operator_rex = re.compile(r'''(?x)
3625 (?P<op>%s)\s*(?P<key>[a-z_]+)
3626 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3627 m = operator_rex.fullmatch(filter_part.strip())
3628 if m:
3629 op = UNARY_OPERATORS[m.group('op')]
3630 actual_value = dct.get(m.group('key'))
3631 if is_incomplete(m.group('key')) and actual_value is None:
3632 return True
3633 return op(actual_value)
3634
3635 raise ValueError('Invalid filter part %r' % filter_part)
3636
3637
3638 def match_str(filter_str, dct, incomplete=False):
3639 """ Filter a dictionary with a simple string syntax.
3640 @returns Whether the filter passes
3641 @param incomplete Set of keys that are expected to be missing from dct.
3642 Can be True/False to indicate all/none of the keys may be missing.
3643 All conditions on incomplete keys pass if the key is missing
3644 """
3645 return all(
3646 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3647 for filter_part in re.split(r'(?<!\\)&', filter_str))
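# Hedged examples of the filter syntax (editor's addition):
#   match_str('duration > 600 & like_count >? 100', {'duration': 700})     -> True
#   match_str('!is_live & title *= news', {'is_live': True, 'title': 'x'}) -> False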
3648
3649
3650 def match_filter_func(filters):
3651 if not filters:
3652 return None
3653 filters = set(variadic(filters))
3654
3655 interactive = '-' in filters
3656 if interactive:
3657 filters.remove('-')
3658
3659 def _match_func(info_dict, incomplete=False):
3660 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3661 return NO_DEFAULT if interactive and not incomplete else None
3662 else:
3663 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3664 filter_str = ') | ('.join(map(str.strip, filters))
3665 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3666 return _match_func
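# e.g. match_filter_func('duration > 60') returns a callable suitable for the
# "match_filter" option: it yields None to accept an entry and a skip message
# otherwise (editor's note).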
3667
3668
3669 class download_range_func:
3670 def __init__(self, chapters, ranges):
3671 self.chapters, self.ranges = chapters, ranges
3672
3673 def __call__(self, info_dict, ydl):
3674 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3675 else 'Cannot match chapters since chapter information is unavailable')
3676 for regex in self.chapters or []:
3677 for i, chapter in enumerate(info_dict.get('chapters') or []):
3678 if re.search(regex, chapter['title']):
3679 warning = None
3680 yield {**chapter, 'index': i}
3681 if self.chapters and warning:
3682 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3683
3684 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3685
3686 def __eq__(self, other):
3687 return (isinstance(other, download_range_func)
3688 and self.chapters == other.chapters and self.ranges == other.ranges)
3689
3690
3691 def parse_dfxp_time_expr(time_expr):
3692 if not time_expr:
3693 return
3694
3695 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3696 if mobj:
3697 return float(mobj.group('time_offset'))
3698
3699 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3700 if mobj:
3701 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
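# Editor's examples: parse_dfxp_time_expr('1.5s') -> 1.5 and
# parse_dfxp_time_expr('00:01:02.500') -> 62.5; an empty or unparsable
# expression returns None.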
3702
3703
3704 def srt_subtitles_timecode(seconds):
3705 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3706
3707
3708 def ass_subtitles_timecode(seconds):
3709 time = timetuple_from_msec(seconds * 1000)
3710 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
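# e.g. srt_subtitles_timecode(3661.5) -> '01:01:01,500' and
# ass_subtitles_timecode(3661.5) -> '1:01:01.50' (editor's examples)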
3711
3712
3713 def dfxp2srt(dfxp_data):
3714 '''
3715 @param dfxp_data A bytes-like object containing DFXP data
3716 @returns A unicode object containing converted SRT data
3717 '''
3718 LEGACY_NAMESPACES = (
3719 (b'http://www.w3.org/ns/ttml', [
3720 b'http://www.w3.org/2004/11/ttaf1',
3721 b'http://www.w3.org/2006/04/ttaf1',
3722 b'http://www.w3.org/2006/10/ttaf1',
3723 ]),
3724 (b'http://www.w3.org/ns/ttml#styling', [
3725 b'http://www.w3.org/ns/ttml#style',
3726 ]),
3727 )
3728
3729 SUPPORTED_STYLING = [
3730 'color',
3731 'fontFamily',
3732 'fontSize',
3733 'fontStyle',
3734 'fontWeight',
3735 'textDecoration'
3736 ]
3737
3738 _x = functools.partial(xpath_with_ns, ns_map={
3739 'xml': 'http://www.w3.org/XML/1998/namespace',
3740 'ttml': 'http://www.w3.org/ns/ttml',
3741 'tts': 'http://www.w3.org/ns/ttml#styling',
3742 })
3743
3744 styles = {}
3745 default_style = {}
3746
3747 class TTMLPElementParser:
3748 def __init__(self):  # per-instance state: class-level mutable defaults would be shared between parser instances
3749 self._out = ''
3750 self._unclosed_elements = []
3751 self._applied_styles = []
3751
3752 def start(self, tag, attrib):
3753 if tag in (_x('ttml:br'), 'br'):
3754 self._out += '\n'
3755 else:
3756 unclosed_elements = []
3757 style = {}
3758 element_style_id = attrib.get('style')
3759 if default_style:
3760 style.update(default_style)
3761 if element_style_id:
3762 style.update(styles.get(element_style_id, {}))
3763 for prop in SUPPORTED_STYLING:
3764 prop_val = attrib.get(_x('tts:' + prop))
3765 if prop_val:
3766 style[prop] = prop_val
3767 if style:
3768 font = ''
3769 for k, v in sorted(style.items()):
3770 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3771 continue
3772 if k == 'color':
3773 font += ' color="%s"' % v
3774 elif k == 'fontSize':
3775 font += ' size="%s"' % v
3776 elif k == 'fontFamily':
3777 font += ' face="%s"' % v
3778 elif k == 'fontWeight' and v == 'bold':
3779 self._out += '<b>'
3780 unclosed_elements.append('b')
3781 elif k == 'fontStyle' and v == 'italic':
3782 self._out += '<i>'
3783 unclosed_elements.append('i')
3784 elif k == 'textDecoration' and v == 'underline':
3785 self._out += '<u>'
3786 unclosed_elements.append('u')
3787 if font:
3788 self._out += '<font' + font + '>'
3789 unclosed_elements.append('font')
3790 applied_style = {}
3791 if self._applied_styles:
3792 applied_style.update(self._applied_styles[-1])
3793 applied_style.update(style)
3794 self._applied_styles.append(applied_style)
3795 self._unclosed_elements.append(unclosed_elements)
3796
3797 def end(self, tag):
3798 if tag not in (_x('ttml:br'), 'br'):
3799 unclosed_elements = self._unclosed_elements.pop()
3800 for element in reversed(unclosed_elements):
3801 self._out += '</%s>' % element
3802 if unclosed_elements and self._applied_styles:
3803 self._applied_styles.pop()
3804
3805 def data(self, data):
3806 self._out += data
3807
3808 def close(self):
3809 return self._out.strip()
3810
3811 def parse_node(node):
3812 target = TTMLPElementParser()
3813 parser = xml.etree.ElementTree.XMLParser(target=target)
3814 parser.feed(xml.etree.ElementTree.tostring(node))
3815 return parser.close()
3816
3817 for k, v in LEGACY_NAMESPACES:
3818 for ns in v:
3819 dfxp_data = dfxp_data.replace(ns, k)
3820
3821 dfxp = compat_etree_fromstring(dfxp_data)
3822 out = []
3823 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3824
3825 if not paras:
3826 raise ValueError('Invalid dfxp/TTML subtitle')
3827
3828 repeat = False
3829 while True:
3830 for style in dfxp.findall(_x('.//ttml:style')):
3831 style_id = style.get('id') or style.get(_x('xml:id'))
3832 if not style_id:
3833 continue
3834 parent_style_id = style.get('style')
3835 if parent_style_id:
3836 if parent_style_id not in styles:
3837 repeat = True
3838 continue
3839 styles[style_id] = styles[parent_style_id].copy()
3840 for prop in SUPPORTED_STYLING:
3841 prop_val = style.get(_x('tts:' + prop))
3842 if prop_val:
3843 styles.setdefault(style_id, {})[prop] = prop_val
3844 if repeat:
3845 repeat = False
3846 else:
3847 break
3848
3849 for p in ('body', 'div'):
3850 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3851 if ele is None:
3852 continue
3853 style = styles.get(ele.get('style'))
3854 if not style:
3855 continue
3856 default_style.update(style)
3857
3858 for para, index in zip(paras, itertools.count(1)):
3859 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3860 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3861 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3862 if begin_time is None:
3863 continue
3864 if not end_time:
3865 if not dur:
3866 continue
3867 end_time = begin_time + dur
3868 out.append('%d\n%s --> %s\n%s\n\n' % (
3869 index,
3870 srt_subtitles_timecode(begin_time),
3871 srt_subtitles_timecode(end_time),
3872 parse_node(para)))
3873
3874 return ''.join(out)
3875
3876
3877 def cli_option(params, command_option, param, separator=None):
3878 param = params.get(param)
3879 return ([] if param is None
3880 else [command_option, str(param)] if separator is None
3881 else [f'{command_option}{separator}{param}'])
3882
3883
3884 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3885 param = params.get(param)
3886 assert param in (True, False, None)
3887 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3888
3889
3890 def cli_valueless_option(params, command_option, param, expected_value=True):
3891 return [command_option] if params.get(param) == expected_value else []
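# Hedged examples of how these helpers expand params into CLI arguments:
#   cli_option({'proxy': 'socks5://127.0.0.1'}, '--proxy', 'proxy')
#       -> ['--proxy', 'socks5://127.0.0.1']
#   cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#       -> ['--no-check-certificate', 'true']
#   cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#       -> ['--quiet']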
3892
3893
3894 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3895 if isinstance(argdict, (list, tuple)): # for backward compatibility
3896 if use_compat:
3897 return argdict
3898 else:
3899 argdict = None
3900 if argdict is None:
3901 return default
3902 assert isinstance(argdict, dict)
3903
3904 assert isinstance(keys, (list, tuple))
3905 for key_list in keys:
3906 arg_list = list(filter(
3907 lambda x: x is not None,
3908 [argdict.get(key.lower()) for key in variadic(key_list)]))
3909 if arg_list:
3910 return [arg for args in arg_list for arg in args]
3911 return default
3912
3913
3914 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3915 main_key, exe = main_key.lower(), exe.lower()
3916 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3917 keys = [f'{root_key}{k}' for k in (keys or [''])]
3918 if root_key in keys:
3919 if main_key != exe:
3920 keys.append((main_key, exe))
3921 keys.append('default')
3922 else:
3923 use_compat = False
3924 return cli_configuration_args(argdict, keys, default, use_compat)
3925
3926
3927 class ISO639Utils:
3928 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3929 _lang_map = {
3930 'aa': 'aar',
3931 'ab': 'abk',
3932 'ae': 'ave',
3933 'af': 'afr',
3934 'ak': 'aka',
3935 'am': 'amh',
3936 'an': 'arg',
3937 'ar': 'ara',
3938 'as': 'asm',
3939 'av': 'ava',
3940 'ay': 'aym',
3941 'az': 'aze',
3942 'ba': 'bak',
3943 'be': 'bel',
3944 'bg': 'bul',
3945 'bh': 'bih',
3946 'bi': 'bis',
3947 'bm': 'bam',
3948 'bn': 'ben',
3949 'bo': 'bod',
3950 'br': 'bre',
3951 'bs': 'bos',
3952 'ca': 'cat',
3953 'ce': 'che',
3954 'ch': 'cha',
3955 'co': 'cos',
3956 'cr': 'cre',
3957 'cs': 'ces',
3958 'cu': 'chu',
3959 'cv': 'chv',
3960 'cy': 'cym',
3961 'da': 'dan',
3962 'de': 'deu',
3963 'dv': 'div',
3964 'dz': 'dzo',
3965 'ee': 'ewe',
3966 'el': 'ell',
3967 'en': 'eng',
3968 'eo': 'epo',
3969 'es': 'spa',
3970 'et': 'est',
3971 'eu': 'eus',
3972 'fa': 'fas',
3973 'ff': 'ful',
3974 'fi': 'fin',
3975 'fj': 'fij',
3976 'fo': 'fao',
3977 'fr': 'fra',
3978 'fy': 'fry',
3979 'ga': 'gle',
3980 'gd': 'gla',
3981 'gl': 'glg',
3982 'gn': 'grn',
3983 'gu': 'guj',
3984 'gv': 'glv',
3985 'ha': 'hau',
3986 'he': 'heb',
3987 'iw': 'heb', # Replaced by he in 1989 revision
3988 'hi': 'hin',
3989 'ho': 'hmo',
3990 'hr': 'hrv',
3991 'ht': 'hat',
3992 'hu': 'hun',
3993 'hy': 'hye',
3994 'hz': 'her',
3995 'ia': 'ina',
3996 'id': 'ind',
3997 'in': 'ind', # Replaced by id in 1989 revision
3998 'ie': 'ile',
3999 'ig': 'ibo',
4000 'ii': 'iii',
4001 'ik': 'ipk',
4002 'io': 'ido',
4003 'is': 'isl',
4004 'it': 'ita',
4005 'iu': 'iku',
4006 'ja': 'jpn',
4007 'jv': 'jav',
4008 'ka': 'kat',
4009 'kg': 'kon',
4010 'ki': 'kik',
4011 'kj': 'kua',
4012 'kk': 'kaz',
4013 'kl': 'kal',
4014 'km': 'khm',
4015 'kn': 'kan',
4016 'ko': 'kor',
4017 'kr': 'kau',
4018 'ks': 'kas',
4019 'ku': 'kur',
4020 'kv': 'kom',
4021 'kw': 'cor',
4022 'ky': 'kir',
4023 'la': 'lat',
4024 'lb': 'ltz',
4025 'lg': 'lug',
4026 'li': 'lim',
4027 'ln': 'lin',
4028 'lo': 'lao',
4029 'lt': 'lit',
4030 'lu': 'lub',
4031 'lv': 'lav',
4032 'mg': 'mlg',
4033 'mh': 'mah',
4034 'mi': 'mri',
4035 'mk': 'mkd',
4036 'ml': 'mal',
4037 'mn': 'mon',
4038 'mr': 'mar',
4039 'ms': 'msa',
4040 'mt': 'mlt',
4041 'my': 'mya',
4042 'na': 'nau',
4043 'nb': 'nob',
4044 'nd': 'nde',
4045 'ne': 'nep',
4046 'ng': 'ndo',
4047 'nl': 'nld',
4048 'nn': 'nno',
4049 'no': 'nor',
4050 'nr': 'nbl',
4051 'nv': 'nav',
4052 'ny': 'nya',
4053 'oc': 'oci',
4054 'oj': 'oji',
4055 'om': 'orm',
4056 'or': 'ori',
4057 'os': 'oss',
4058 'pa': 'pan',
4059 'pi': 'pli',
4060 'pl': 'pol',
4061 'ps': 'pus',
4062 'pt': 'por',
4063 'qu': 'que',
4064 'rm': 'roh',
4065 'rn': 'run',
4066 'ro': 'ron',
4067 'ru': 'rus',
4068 'rw': 'kin',
4069 'sa': 'san',
4070 'sc': 'srd',
4071 'sd': 'snd',
4072 'se': 'sme',
4073 'sg': 'sag',
4074 'si': 'sin',
4075 'sk': 'slk',
4076 'sl': 'slv',
4077 'sm': 'smo',
4078 'sn': 'sna',
4079 'so': 'som',
4080 'sq': 'sqi',
4081 'sr': 'srp',
4082 'ss': 'ssw',
4083 'st': 'sot',
4084 'su': 'sun',
4085 'sv': 'swe',
4086 'sw': 'swa',
4087 'ta': 'tam',
4088 'te': 'tel',
4089 'tg': 'tgk',
4090 'th': 'tha',
4091 'ti': 'tir',
4092 'tk': 'tuk',
4093 'tl': 'tgl',
4094 'tn': 'tsn',
4095 'to': 'ton',
4096 'tr': 'tur',
4097 'ts': 'tso',
4098 'tt': 'tat',
4099 'tw': 'twi',
4100 'ty': 'tah',
4101 'ug': 'uig',
4102 'uk': 'ukr',
4103 'ur': 'urd',
4104 'uz': 'uzb',
4105 've': 'ven',
4106 'vi': 'vie',
4107 'vo': 'vol',
4108 'wa': 'wln',
4109 'wo': 'wol',
4110 'xh': 'xho',
4111 'yi': 'yid',
4112 'ji': 'yid', # Replaced by yi in 1989 revision
4113 'yo': 'yor',
4114 'za': 'zha',
4115 'zh': 'zho',
4116 'zu': 'zul',
4117 }
4118
4119 @classmethod
4120 def short2long(cls, code):
4121 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4122 return cls._lang_map.get(code[:2])
4123
4124 @classmethod
4125 def long2short(cls, code):
4126 """Convert language code from ISO 639-2/T to ISO 639-1"""
4127 for short_name, long_name in cls._lang_map.items():
4128 if long_name == code:
4129 return short_name
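# e.g. ISO639Utils.short2long('en') -> 'eng' and ISO639Utils.long2short('eng')
# -> 'en'; short2long() only inspects the first two characters (editor's note).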
4130
4131
4132 class ISO3166Utils:
4133 # From http://data.okfn.org/data/core/country-list
4134 _country_map = {
4135 'AF': 'Afghanistan',
4136 'AX': 'Åland Islands',
4137 'AL': 'Albania',
4138 'DZ': 'Algeria',
4139 'AS': 'American Samoa',
4140 'AD': 'Andorra',
4141 'AO': 'Angola',
4142 'AI': 'Anguilla',
4143 'AQ': 'Antarctica',
4144 'AG': 'Antigua and Barbuda',
4145 'AR': 'Argentina',
4146 'AM': 'Armenia',
4147 'AW': 'Aruba',
4148 'AU': 'Australia',
4149 'AT': 'Austria',
4150 'AZ': 'Azerbaijan',
4151 'BS': 'Bahamas',
4152 'BH': 'Bahrain',
4153 'BD': 'Bangladesh',
4154 'BB': 'Barbados',
4155 'BY': 'Belarus',
4156 'BE': 'Belgium',
4157 'BZ': 'Belize',
4158 'BJ': 'Benin',
4159 'BM': 'Bermuda',
4160 'BT': 'Bhutan',
4161 'BO': 'Bolivia, Plurinational State of',
4162 'BQ': 'Bonaire, Sint Eustatius and Saba',
4163 'BA': 'Bosnia and Herzegovina',
4164 'BW': 'Botswana',
4165 'BV': 'Bouvet Island',
4166 'BR': 'Brazil',
4167 'IO': 'British Indian Ocean Territory',
4168 'BN': 'Brunei Darussalam',
4169 'BG': 'Bulgaria',
4170 'BF': 'Burkina Faso',
4171 'BI': 'Burundi',
4172 'KH': 'Cambodia',
4173 'CM': 'Cameroon',
4174 'CA': 'Canada',
4175 'CV': 'Cape Verde',
4176 'KY': 'Cayman Islands',
4177 'CF': 'Central African Republic',
4178 'TD': 'Chad',
4179 'CL': 'Chile',
4180 'CN': 'China',
4181 'CX': 'Christmas Island',
4182 'CC': 'Cocos (Keeling) Islands',
4183 'CO': 'Colombia',
4184 'KM': 'Comoros',
4185 'CG': 'Congo',
4186 'CD': 'Congo, the Democratic Republic of the',
4187 'CK': 'Cook Islands',
4188 'CR': 'Costa Rica',
4189 'CI': 'Côte d\'Ivoire',
4190 'HR': 'Croatia',
4191 'CU': 'Cuba',
4192 'CW': 'Curaçao',
4193 'CY': 'Cyprus',
4194 'CZ': 'Czech Republic',
4195 'DK': 'Denmark',
4196 'DJ': 'Djibouti',
4197 'DM': 'Dominica',
4198 'DO': 'Dominican Republic',
4199 'EC': 'Ecuador',
4200 'EG': 'Egypt',
4201 'SV': 'El Salvador',
4202 'GQ': 'Equatorial Guinea',
4203 'ER': 'Eritrea',
4204 'EE': 'Estonia',
4205 'ET': 'Ethiopia',
4206 'FK': 'Falkland Islands (Malvinas)',
4207 'FO': 'Faroe Islands',
4208 'FJ': 'Fiji',
4209 'FI': 'Finland',
4210 'FR': 'France',
4211 'GF': 'French Guiana',
4212 'PF': 'French Polynesia',
4213 'TF': 'French Southern Territories',
4214 'GA': 'Gabon',
4215 'GM': 'Gambia',
4216 'GE': 'Georgia',
4217 'DE': 'Germany',
4218 'GH': 'Ghana',
4219 'GI': 'Gibraltar',
4220 'GR': 'Greece',
4221 'GL': 'Greenland',
4222 'GD': 'Grenada',
4223 'GP': 'Guadeloupe',
4224 'GU': 'Guam',
4225 'GT': 'Guatemala',
4226 'GG': 'Guernsey',
4227 'GN': 'Guinea',
4228 'GW': 'Guinea-Bissau',
4229 'GY': 'Guyana',
4230 'HT': 'Haiti',
4231 'HM': 'Heard Island and McDonald Islands',
4232 'VA': 'Holy See (Vatican City State)',
4233 'HN': 'Honduras',
4234 'HK': 'Hong Kong',
4235 'HU': 'Hungary',
4236 'IS': 'Iceland',
4237 'IN': 'India',
4238 'ID': 'Indonesia',
4239 'IR': 'Iran, Islamic Republic of',
4240 'IQ': 'Iraq',
4241 'IE': 'Ireland',
4242 'IM': 'Isle of Man',
4243 'IL': 'Israel',
4244 'IT': 'Italy',
4245 'JM': 'Jamaica',
4246 'JP': 'Japan',
4247 'JE': 'Jersey',
4248 'JO': 'Jordan',
4249 'KZ': 'Kazakhstan',
4250 'KE': 'Kenya',
4251 'KI': 'Kiribati',
4252 'KP': 'Korea, Democratic People\'s Republic of',
4253 'KR': 'Korea, Republic of',
4254 'KW': 'Kuwait',
4255 'KG': 'Kyrgyzstan',
4256 'LA': 'Lao People\'s Democratic Republic',
4257 'LV': 'Latvia',
4258 'LB': 'Lebanon',
4259 'LS': 'Lesotho',
4260 'LR': 'Liberia',
4261 'LY': 'Libya',
4262 'LI': 'Liechtenstein',
4263 'LT': 'Lithuania',
4264 'LU': 'Luxembourg',
4265 'MO': 'Macao',
4266 'MK': 'Macedonia, the Former Yugoslav Republic of',
4267 'MG': 'Madagascar',
4268 'MW': 'Malawi',
4269 'MY': 'Malaysia',
4270 'MV': 'Maldives',
4271 'ML': 'Mali',
4272 'MT': 'Malta',
4273 'MH': 'Marshall Islands',
4274 'MQ': 'Martinique',
4275 'MR': 'Mauritania',
4276 'MU': 'Mauritius',
4277 'YT': 'Mayotte',
4278 'MX': 'Mexico',
4279 'FM': 'Micronesia, Federated States of',
4280 'MD': 'Moldova, Republic of',
4281 'MC': 'Monaco',
4282 'MN': 'Mongolia',
4283 'ME': 'Montenegro',
4284 'MS': 'Montserrat',
4285 'MA': 'Morocco',
4286 'MZ': 'Mozambique',
4287 'MM': 'Myanmar',
4288 'NA': 'Namibia',
4289 'NR': 'Nauru',
4290 'NP': 'Nepal',
4291 'NL': 'Netherlands',
4292 'NC': 'New Caledonia',
4293 'NZ': 'New Zealand',
4294 'NI': 'Nicaragua',
4295 'NE': 'Niger',
4296 'NG': 'Nigeria',
4297 'NU': 'Niue',
4298 'NF': 'Norfolk Island',
4299 'MP': 'Northern Mariana Islands',
4300 'NO': 'Norway',
4301 'OM': 'Oman',
4302 'PK': 'Pakistan',
4303 'PW': 'Palau',
4304 'PS': 'Palestine, State of',
4305 'PA': 'Panama',
4306 'PG': 'Papua New Guinea',
4307 'PY': 'Paraguay',
4308 'PE': 'Peru',
4309 'PH': 'Philippines',
4310 'PN': 'Pitcairn',
4311 'PL': 'Poland',
4312 'PT': 'Portugal',
4313 'PR': 'Puerto Rico',
4314 'QA': 'Qatar',
4315 'RE': 'Réunion',
4316 'RO': 'Romania',
4317 'RU': 'Russian Federation',
4318 'RW': 'Rwanda',
4319 'BL': 'Saint Barthélemy',
4320 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4321 'KN': 'Saint Kitts and Nevis',
4322 'LC': 'Saint Lucia',
4323 'MF': 'Saint Martin (French part)',
4324 'PM': 'Saint Pierre and Miquelon',
4325 'VC': 'Saint Vincent and the Grenadines',
4326 'WS': 'Samoa',
4327 'SM': 'San Marino',
4328 'ST': 'Sao Tome and Principe',
4329 'SA': 'Saudi Arabia',
4330 'SN': 'Senegal',
4331 'RS': 'Serbia',
4332 'SC': 'Seychelles',
4333 'SL': 'Sierra Leone',
4334 'SG': 'Singapore',
4335 'SX': 'Sint Maarten (Dutch part)',
4336 'SK': 'Slovakia',
4337 'SI': 'Slovenia',
4338 'SB': 'Solomon Islands',
4339 'SO': 'Somalia',
4340 'ZA': 'South Africa',
4341 'GS': 'South Georgia and the South Sandwich Islands',
4342 'SS': 'South Sudan',
4343 'ES': 'Spain',
4344 'LK': 'Sri Lanka',
4345 'SD': 'Sudan',
4346 'SR': 'Suriname',
4347 'SJ': 'Svalbard and Jan Mayen',
4348 'SZ': 'Swaziland',
4349 'SE': 'Sweden',
4350 'CH': 'Switzerland',
4351 'SY': 'Syrian Arab Republic',
4352 'TW': 'Taiwan, Province of China',
4353 'TJ': 'Tajikistan',
4354 'TZ': 'Tanzania, United Republic of',
4355 'TH': 'Thailand',
4356 'TL': 'Timor-Leste',
4357 'TG': 'Togo',
4358 'TK': 'Tokelau',
4359 'TO': 'Tonga',
4360 'TT': 'Trinidad and Tobago',
4361 'TN': 'Tunisia',
4362 'TR': 'Turkey',
4363 'TM': 'Turkmenistan',
4364 'TC': 'Turks and Caicos Islands',
4365 'TV': 'Tuvalu',
4366 'UG': 'Uganda',
4367 'UA': 'Ukraine',
4368 'AE': 'United Arab Emirates',
4369 'GB': 'United Kingdom',
4370 'US': 'United States',
4371 'UM': 'United States Minor Outlying Islands',
4372 'UY': 'Uruguay',
4373 'UZ': 'Uzbekistan',
4374 'VU': 'Vanuatu',
4375 'VE': 'Venezuela, Bolivarian Republic of',
4376 'VN': 'Viet Nam',
4377 'VG': 'Virgin Islands, British',
4378 'VI': 'Virgin Islands, U.S.',
4379 'WF': 'Wallis and Futuna',
4380 'EH': 'Western Sahara',
4381 'YE': 'Yemen',
4382 'ZM': 'Zambia',
4383 'ZW': 'Zimbabwe',
4384 # Not ISO 3166 codes, but used for IP blocks
4385 'AP': 'Asia/Pacific Region',
4386 'EU': 'Europe',
4387 }
4388
4389 @classmethod
4390 def short2full(cls, code):
4391 """Convert an ISO 3166-2 country code to the corresponding full name"""
4392 return cls._country_map.get(code.upper())
4393
4394
4395 class GeoUtils:
4396 # Major IPv4 address blocks per country
4397 _country_ip_map = {
4398 'AD': '46.172.224.0/19',
4399 'AE': '94.200.0.0/13',
4400 'AF': '149.54.0.0/17',
4401 'AG': '209.59.64.0/18',
4402 'AI': '204.14.248.0/21',
4403 'AL': '46.99.0.0/16',
4404 'AM': '46.70.0.0/15',
4405 'AO': '105.168.0.0/13',
4406 'AP': '182.50.184.0/21',
4407 'AQ': '23.154.160.0/24',
4408 'AR': '181.0.0.0/12',
4409 'AS': '202.70.112.0/20',
4410 'AT': '77.116.0.0/14',
4411 'AU': '1.128.0.0/11',
4412 'AW': '181.41.0.0/18',
4413 'AX': '185.217.4.0/22',
4414 'AZ': '5.197.0.0/16',
4415 'BA': '31.176.128.0/17',
4416 'BB': '65.48.128.0/17',
4417 'BD': '114.130.0.0/16',
4418 'BE': '57.0.0.0/8',
4419 'BF': '102.178.0.0/15',
4420 'BG': '95.42.0.0/15',
4421 'BH': '37.131.0.0/17',
4422 'BI': '154.117.192.0/18',
4423 'BJ': '137.255.0.0/16',
4424 'BL': '185.212.72.0/23',
4425 'BM': '196.12.64.0/18',
4426 'BN': '156.31.0.0/16',
4427 'BO': '161.56.0.0/16',
4428 'BQ': '161.0.80.0/20',
4429 'BR': '191.128.0.0/12',
4430 'BS': '24.51.64.0/18',
4431 'BT': '119.2.96.0/19',
4432 'BW': '168.167.0.0/16',
4433 'BY': '178.120.0.0/13',
4434 'BZ': '179.42.192.0/18',
4435 'CA': '99.224.0.0/11',
4436 'CD': '41.243.0.0/16',
4437 'CF': '197.242.176.0/21',
4438 'CG': '160.113.0.0/16',
4439 'CH': '85.0.0.0/13',
4440 'CI': '102.136.0.0/14',
4441 'CK': '202.65.32.0/19',
4442 'CL': '152.172.0.0/14',
4443 'CM': '102.244.0.0/14',
4444 'CN': '36.128.0.0/10',
4445 'CO': '181.240.0.0/12',
4446 'CR': '201.192.0.0/12',
4447 'CU': '152.206.0.0/15',
4448 'CV': '165.90.96.0/19',
4449 'CW': '190.88.128.0/17',
4450 'CY': '31.153.0.0/16',
4451 'CZ': '88.100.0.0/14',
4452 'DE': '53.0.0.0/8',
4453 'DJ': '197.241.0.0/17',
4454 'DK': '87.48.0.0/12',
4455 'DM': '192.243.48.0/20',
4456 'DO': '152.166.0.0/15',
4457 'DZ': '41.96.0.0/12',
4458 'EC': '186.68.0.0/15',
4459 'EE': '90.190.0.0/15',
4460 'EG': '156.160.0.0/11',
4461 'ER': '196.200.96.0/20',
4462 'ES': '88.0.0.0/11',
4463 'ET': '196.188.0.0/14',
4464 'EU': '2.16.0.0/13',
4465 'FI': '91.152.0.0/13',
4466 'FJ': '144.120.0.0/16',
4467 'FK': '80.73.208.0/21',
4468 'FM': '119.252.112.0/20',
4469 'FO': '88.85.32.0/19',
4470 'FR': '90.0.0.0/9',
4471 'GA': '41.158.0.0/15',
4472 'GB': '25.0.0.0/8',
4473 'GD': '74.122.88.0/21',
4474 'GE': '31.146.0.0/16',
4475 'GF': '161.22.64.0/18',
4476 'GG': '62.68.160.0/19',
4477 'GH': '154.160.0.0/12',
4478 'GI': '95.164.0.0/16',
4479 'GL': '88.83.0.0/19',
4480 'GM': '160.182.0.0/15',
4481 'GN': '197.149.192.0/18',
4482 'GP': '104.250.0.0/19',
4483 'GQ': '105.235.224.0/20',
4484 'GR': '94.64.0.0/13',
4485 'GT': '168.234.0.0/16',
4486 'GU': '168.123.0.0/16',
4487 'GW': '197.214.80.0/20',
4488 'GY': '181.41.64.0/18',
4489 'HK': '113.252.0.0/14',
4490 'HN': '181.210.0.0/16',
4491 'HR': '93.136.0.0/13',
4492 'HT': '148.102.128.0/17',
4493 'HU': '84.0.0.0/14',
4494 'ID': '39.192.0.0/10',
4495 'IE': '87.32.0.0/12',
4496 'IL': '79.176.0.0/13',
4497 'IM': '5.62.80.0/20',
4498 'IN': '117.192.0.0/10',
4499 'IO': '203.83.48.0/21',
4500 'IQ': '37.236.0.0/14',
4501 'IR': '2.176.0.0/12',
4502 'IS': '82.221.0.0/16',
4503 'IT': '79.0.0.0/10',
4504 'JE': '87.244.64.0/18',
4505 'JM': '72.27.0.0/17',
4506 'JO': '176.29.0.0/16',
4507 'JP': '133.0.0.0/8',
4508 'KE': '105.48.0.0/12',
4509 'KG': '158.181.128.0/17',
4510 'KH': '36.37.128.0/17',
4511 'KI': '103.25.140.0/22',
4512 'KM': '197.255.224.0/20',
4513 'KN': '198.167.192.0/19',
4514 'KP': '175.45.176.0/22',
4515 'KR': '175.192.0.0/10',
4516 'KW': '37.36.0.0/14',
4517 'KY': '64.96.0.0/15',
4518 'KZ': '2.72.0.0/13',
4519 'LA': '115.84.64.0/18',
4520 'LB': '178.135.0.0/16',
4521 'LC': '24.92.144.0/20',
4522 'LI': '82.117.0.0/19',
4523 'LK': '112.134.0.0/15',
4524 'LR': '102.183.0.0/16',
4525 'LS': '129.232.0.0/17',
4526 'LT': '78.56.0.0/13',
4527 'LU': '188.42.0.0/16',
4528 'LV': '46.109.0.0/16',
4529 'LY': '41.252.0.0/14',
4530 'MA': '105.128.0.0/11',
4531 'MC': '88.209.64.0/18',
4532 'MD': '37.246.0.0/16',
4533 'ME': '178.175.0.0/17',
4534 'MF': '74.112.232.0/21',
4535 'MG': '154.126.0.0/17',
4536 'MH': '117.103.88.0/21',
4537 'MK': '77.28.0.0/15',
4538 'ML': '154.118.128.0/18',
4539 'MM': '37.111.0.0/17',
4540 'MN': '49.0.128.0/17',
4541 'MO': '60.246.0.0/16',
4542 'MP': '202.88.64.0/20',
4543 'MQ': '109.203.224.0/19',
4544 'MR': '41.188.64.0/18',
4545 'MS': '208.90.112.0/22',
4546 'MT': '46.11.0.0/16',
4547 'MU': '105.16.0.0/12',
4548 'MV': '27.114.128.0/18',
4549 'MW': '102.70.0.0/15',
4550 'MX': '187.192.0.0/11',
4551 'MY': '175.136.0.0/13',
4552 'MZ': '197.218.0.0/15',
4553 'NA': '41.182.0.0/16',
4554 'NC': '101.101.0.0/18',
4555 'NE': '197.214.0.0/18',
4556 'NF': '203.17.240.0/22',
4557 'NG': '105.112.0.0/12',
4558 'NI': '186.76.0.0/15',
4559 'NL': '145.96.0.0/11',
4560 'NO': '84.208.0.0/13',
4561 'NP': '36.252.0.0/15',
4562 'NR': '203.98.224.0/19',
4563 'NU': '49.156.48.0/22',
4564 'NZ': '49.224.0.0/14',
4565 'OM': '5.36.0.0/15',
4566 'PA': '186.72.0.0/15',
4567 'PE': '186.160.0.0/14',
4568 'PF': '123.50.64.0/18',
4569 'PG': '124.240.192.0/19',
4570 'PH': '49.144.0.0/13',
4571 'PK': '39.32.0.0/11',
4572 'PL': '83.0.0.0/11',
4573 'PM': '70.36.0.0/20',
4574 'PR': '66.50.0.0/16',
4575 'PS': '188.161.0.0/16',
4576 'PT': '85.240.0.0/13',
4577 'PW': '202.124.224.0/20',
4578 'PY': '181.120.0.0/14',
4579 'QA': '37.210.0.0/15',
4580 'RE': '102.35.0.0/16',
4581 'RO': '79.112.0.0/13',
4582 'RS': '93.86.0.0/15',
4583 'RU': '5.136.0.0/13',
4584 'RW': '41.186.0.0/16',
4585 'SA': '188.48.0.0/13',
4586 'SB': '202.1.160.0/19',
4587 'SC': '154.192.0.0/11',
4588 'SD': '102.120.0.0/13',
4589 'SE': '78.64.0.0/12',
4590 'SG': '8.128.0.0/10',
4591 'SI': '188.196.0.0/14',
4592 'SK': '78.98.0.0/15',
4593 'SL': '102.143.0.0/17',
4594 'SM': '89.186.32.0/19',
4595 'SN': '41.82.0.0/15',
4596 'SO': '154.115.192.0/18',
4597 'SR': '186.179.128.0/17',
4598 'SS': '105.235.208.0/21',
4599 'ST': '197.159.160.0/19',
4600 'SV': '168.243.0.0/16',
4601 'SX': '190.102.0.0/20',
4602 'SY': '5.0.0.0/16',
4603 'SZ': '41.84.224.0/19',
4604 'TC': '65.255.48.0/20',
4605 'TD': '154.68.128.0/19',
4606 'TG': '196.168.0.0/14',
4607 'TH': '171.96.0.0/13',
4608 'TJ': '85.9.128.0/18',
4609 'TK': '27.96.24.0/21',
4610 'TL': '180.189.160.0/20',
4611 'TM': '95.85.96.0/19',
4612 'TN': '197.0.0.0/11',
4613 'TO': '175.176.144.0/21',
4614 'TR': '78.160.0.0/11',
4615 'TT': '186.44.0.0/15',
4616 'TV': '202.2.96.0/19',
4617 'TW': '120.96.0.0/11',
4618 'TZ': '156.156.0.0/14',
4619 'UA': '37.52.0.0/14',
4620 'UG': '102.80.0.0/13',
4621 'US': '6.0.0.0/8',
4622 'UY': '167.56.0.0/13',
4623 'UZ': '84.54.64.0/18',
4624 'VA': '212.77.0.0/19',
4625 'VC': '207.191.240.0/21',
4626 'VE': '186.88.0.0/13',
4627 'VG': '66.81.192.0/20',
4628 'VI': '146.226.0.0/16',
4629 'VN': '14.160.0.0/11',
4630 'VU': '202.80.32.0/20',
4631 'WF': '117.20.32.0/21',
4632 'WS': '202.4.32.0/19',
4633 'YE': '134.35.0.0/16',
4634 'YT': '41.242.116.0/22',
4635 'ZA': '41.0.0.0/11',
4636 'ZM': '102.144.0.0/13',
4637 'ZW': '102.177.192.0/18',
4638 }
4639
4640 @classmethod
4641 def random_ipv4(cls, code_or_block):
4642 if len(code_or_block) == 2:
4643 block = cls._country_ip_map.get(code_or_block.upper())
4644 if not block:
4645 return None
4646 else:
4647 block = code_or_block
4648 addr, preflen = block.split('/')
4649 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4650 addr_max = addr_min | (0xffffffff >> int(preflen))
4651 return str(socket.inet_ntoa(
4652 struct.pack('!L', random.randint(addr_min, addr_max))))
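# e.g. GeoUtils.random_ipv4('DE') draws an address from 53.0.0.0/8, while
# GeoUtils.random_ipv4('192.168.0.0/16') samples the given block directly
# (editor's illustration).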
4653
4654
4655 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4656 def __init__(self, proxies=None):
4657 # Set default handlers
4658 for type in ('http', 'https'):
4659 setattr(self, '%s_open' % type,
4660 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4661 meth(r, proxy, type))
4662 urllib.request.ProxyHandler.__init__(self, proxies)
4663
4664 def proxy_open(self, req, proxy, type):
4665 req_proxy = req.headers.get('Ytdl-request-proxy')
4666 if req_proxy is not None:
4667 proxy = req_proxy
4668 del req.headers['Ytdl-request-proxy']
4669
4670 if proxy == '__noproxy__':
4671 return None # No Proxy
4672 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4673 req.add_header('Ytdl-socks-proxy', proxy)
4674 # yt-dlp's http/https handlers wrap the socket with SOCKS themselves
4675 return None
4676 return urllib.request.ProxyHandler.proxy_open(
4677 self, req, proxy, type)
4678
4679
4680 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4681 # released into Public Domain
4682 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4683
4684 def long_to_bytes(n, blocksize=0):
4685 """long_to_bytes(n:long, blocksize:int) : string
4686 Convert a long integer to a byte string.
4687
4688 If optional blocksize is given and greater than zero, pad the front of the
4689 byte string with binary zeros so that the length is a multiple of
4690 blocksize.
4691 """
4692 # after much testing, this algorithm was deemed to be the fastest
4693 s = b''
4694 n = int(n)
4695 while n > 0:
4696 s = struct.pack('>I', n & 0xffffffff) + s
4697 n = n >> 32
4698 # strip off leading zeros
4699 for i in range(len(s)):
4700 if s[i] != b'\000'[0]:
4701 break
4702 else:
4703 # only happens when n == 0
4704 s = b'\000'
4705 i = 0
4706 s = s[i:]
4707 # add back some pad bytes. this could be done more efficiently w.r.t. the
4708 # de-padding being done above, but sigh...
4709 if blocksize > 0 and len(s) % blocksize:
4710 s = (blocksize - len(s) % blocksize) * b'\000' + s
4711 return s
4712
4713
4714 def bytes_to_long(s):
4715 """bytes_to_long(string) : long
4716 Convert a byte string to a long integer.
4717
4718 This is (essentially) the inverse of long_to_bytes().
4719 """
4720 acc = 0
4721 length = len(s)
4722 if length % 4:
4723 extra = (4 - length % 4)
4724 s = b'\000' * extra + s
4725 length = length + extra
4726 for i in range(0, length, 4):
4727 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4728 return acc
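# Round-trip sketch (editor's examples):
#   bytes_to_long(b'\x01\x00')     -> 256
#   long_to_bytes(256)             -> b'\x01\x00'
#   long_to_bytes(1, blocksize=4)  -> b'\x00\x00\x00\x01'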
4729
4730
4731 def ohdave_rsa_encrypt(data, exponent, modulus):
4732 '''
4733 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4734
4735 Input:
4736 data: data to encrypt, bytes-like object
4737 exponent, modulus: parameter e and N of RSA algorithm, both integer
4738 Output: hex string of encrypted data
4739
4740 Limitation: supports one block encryption only
4741 '''
4742
4743 payload = int(binascii.hexlify(data[::-1]), 16)
4744 encrypted = pow(payload, exponent, modulus)
4745 return '%x' % encrypted
4746
4747
4748 def pkcs1pad(data, length):
4749 """
4750 Padding input data with PKCS#1 scheme
4751
4752 @param {int[]} data input data
4753 @param {int} length target length
4754 @returns {int[]} padded data
4755 """
4756 if len(data) > length - 11:
4757 raise ValueError('Input data too long for PKCS#1 padding')
4758
4759 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 v1.5 padding octets must be non-zero
4760 return [0, 2] + pseudo_random + [0] + data
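# e.g. pkcs1pad([1, 2, 3], 16) returns a 16-element block shaped
# [0, 2, <10 random non-zero octets>, 0, 1, 2, 3] (editor's illustration).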
4761
4762
4763 def _base_n_table(n, table):
4764 if not table and not n:
4765 raise ValueError('Either table or n must be specified')
4766 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4767
4768 if n and n != len(table):
4769 raise ValueError(f'base {n} exceeds table length {len(table)}')
4770 return table
4771
4772
4773 def encode_base_n(num, n=None, table=None):
4774 """Convert given int to a base-n string"""
4775 table = _base_n_table(n, table)
4776 if not num:
4777 return table[0]
4778
4779 result, base = '', len(table)
4780 while num:
4781 result = table[num % base] + result
4782 num = num // base
4783 return result
4784
4785
4786 def decode_base_n(string, n=None, table=None):
4787 """Convert given base-n string to int"""
4788 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4789 result, base = 0, len(table)
4790 for char in string:
4791 result = result * base + table[char]
4792 return result
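# e.g. encode_base_n(255, 16) -> 'ff' and decode_base_n('ff', 16) -> 255;
# a custom alphabet can be supplied via table= (editor's examples).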
4793
4794
4795 def decode_base(value, digits):
4796 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4797 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
4798 return decode_base_n(value, table=digits)
4799
4800
4801 def decode_packed_codes(code):
4802 mobj = re.search(PACKED_CODES_RE, code)
4803 obfuscated_code, base, count, symbols = mobj.groups()
4804 base = int(base)
4805 count = int(count)
4806 symbols = symbols.split('|')
4807 symbol_table = {}
4808
4809 while count:
4810 count -= 1
4811 base_n_count = encode_base_n(count, base)
4812 symbol_table[base_n_count] = symbols[count] or base_n_count
4813
4814 return re.sub(
4815 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4816 obfuscated_code)
4817
4818
4819 def caesar(s, alphabet, shift):
4820 if shift == 0:
4821 return s
4822 l = len(alphabet)
4823 return ''.join(
4824 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4825 for c in s)
4826
4827
4828 def rot47(s):
4829 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
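# e.g. caesar('ab', 'abc', 1) -> 'bc' and rot47('yt-dlp') -> 'JE\\5=A'
# (editor's examples; rot47 shifts within the printable ASCII range 33-126).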
4830
4831
4832 def parse_m3u8_attributes(attrib):
4833 info = {}
4834 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4835 if val.startswith('"'):
4836 val = val[1:-1]
4837 info[key] = val
4838 return info
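# e.g. (editor's example; quoted values keep embedded commas):
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#       -> {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}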
4839
4840
4841 def urshift(val, n):
4842 return val >> n if val >= 0 else (val + 0x100000000) >> n
4843
4844
4845 # Based on png2str() written by @gdkchan and improved by @yokrysty
4846 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4847 def decode_png(png_data):
4848 # Reference: https://www.w3.org/TR/PNG/
4849 header = png_data[8:]
4850
4851 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4852 raise OSError('Not a valid PNG file.')
4853
4854 int_map = {1: '>B', 2: '>H', 4: '>I'}
4855 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4856
4857 chunks = []
4858
4859 while header:
4860 length = unpack_integer(header[:4])
4861 header = header[4:]
4862
4863 chunk_type = header[:4]
4864 header = header[4:]
4865
4866 chunk_data = header[:length]
4867 header = header[length:]
4868
4869 header = header[4:] # Skip CRC
4870
4871 chunks.append({
4872 'type': chunk_type,
4873 'length': length,
4874 'data': chunk_data
4875 })
4876
4877 ihdr = chunks[0]['data']
4878
4879 width = unpack_integer(ihdr[:4])
4880 height = unpack_integer(ihdr[4:8])
4881
4882 idat = b''
4883
4884 for chunk in chunks:
4885 if chunk['type'] == b'IDAT':
4886 idat += chunk['data']
4887
4888 if not idat:
4889 raise OSError('Unable to read PNG data.')
4890
4891 decompressed_data = bytearray(zlib.decompress(idat))
4892
4893 stride = width * 3
4894 pixels = []
4895
4896 def _get_pixel(idx):
4897 x = idx % stride
4898 y = idx // stride
4899 return pixels[y][x]
4900
4901 for y in range(height):
4902 basePos = y * (1 + stride)
4903 filter_type = decompressed_data[basePos]
4904
4905 current_row = []
4906
4907 pixels.append(current_row)
4908
4909 for x in range(stride):
4910 color = decompressed_data[1 + basePos + x]
4911 basex = y * stride + x
4912 left = 0
4913 up = 0
4914
4915 if x > 2:
4916 left = _get_pixel(basex - 3)
4917 if y > 0:
4918 up = _get_pixel(basex - stride)
4919
4920 if filter_type == 1: # Sub
4921 color = (color + left) & 0xff
4922 elif filter_type == 2: # Up
4923 color = (color + up) & 0xff
4924 elif filter_type == 3: # Average
4925 color = (color + ((left + up) >> 1)) & 0xff
4926 elif filter_type == 4: # Paeth
4927 a = left
4928 b = up
4929 c = 0
4930
4931 if x > 2 and y > 0:
4932 c = _get_pixel(basex - stride - 3)
4933
4934 p = a + b - c
4935
4936 pa = abs(p - a)
4937 pb = abs(p - b)
4938 pc = abs(p - c)
4939
4940 if pa <= pb and pa <= pc:
4941 color = (color + a) & 0xff
4942 elif pb <= pc:
4943 color = (color + b) & 0xff
4944 else:
4945 color = (color + c) & 0xff
4946
4947 current_row.append(color)
4948
4949 return width, height, pixels
4950
4951
4952 def write_xattr(path, key, value):
4953 # Windows: Write xattrs to NTFS Alternate Data Streams:
4954 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4955 if compat_os_name == 'nt':
4956 assert ':' not in key
4957 assert os.path.exists(path)
4958
4959 try:
4960 with open(f'{path}:{key}', 'wb') as f:
4961 f.write(value)
4962 except OSError as e:
4963 raise XAttrMetadataError(e.errno, e.strerror)
4964 return
4965
4966 # UNIX Method 1. Use the xattr/pyxattr modules
4967
4968 setxattr = None
4969 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4970 # Unicode arguments are not supported in pyxattr until version 0.5.0
4971 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4972 if version_tuple(xattr.__version__) >= (0, 5, 0):
4973 setxattr = xattr.set
4974 elif xattr:
4975 setxattr = xattr.setxattr
4976
4977 if setxattr:
4978 try:
4979 setxattr(path, key, value)
4980 except OSError as e:
4981 raise XAttrMetadataError(e.errno, e.strerror)
4982 return
4983
4984 # UNIX Method 2. Use setfattr/xattr executables
4985 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4986 else 'xattr' if check_executable('xattr', ['-h']) else None)
4987 if not exe:
4988 raise XAttrUnavailableError(
4989 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4990 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4991
4992 value = value.decode()
4993 try:
4994 _, stderr, returncode = Popen.run(
4995 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4996 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4997 except OSError as e:
4998 raise XAttrMetadataError(e.errno, e.strerror)
4999 if returncode:
5000 raise XAttrMetadataError(returncode, stderr)
5001
5002
5003 def random_birthday(year_field, month_field, day_field):
5004 start_date = datetime.date(1950, 1, 1)
5005 end_date = datetime.date(1995, 12, 31)
5006 offset = random.randint(0, (end_date - start_date).days)
5007 random_date = start_date + datetime.timedelta(offset)
5008 return {
5009 year_field: str(random_date.year),
5010 month_field: str(random_date.month),
5011 day_field: str(random_date.day),
5012 }
5013
5014
5015 # Templates for internet shortcut files, which are plain text files.
5016 DOT_URL_LINK_TEMPLATE = '''\
5017 [InternetShortcut]
5018 URL=%(url)s
5019 '''
5020
5021 DOT_WEBLOC_LINK_TEMPLATE = '''\
5022 <?xml version="1.0" encoding="UTF-8"?>
5023 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5024 <plist version="1.0">
5025 <dict>
5026 \t<key>URL</key>
5027 \t<string>%(url)s</string>
5028 </dict>
5029 </plist>
5030 '''
5031
5032 DOT_DESKTOP_LINK_TEMPLATE = '''\
5033 [Desktop Entry]
5034 Encoding=UTF-8
5035 Name=%(filename)s
5036 Type=Link
5037 URL=%(url)s
5038 Icon=text-html
5039 '''
5040
5041 LINK_TEMPLATES = {
5042 'url': DOT_URL_LINK_TEMPLATE,
5043 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5044 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5045 }
5046
5047
5048 def iri_to_uri(iri):
5049 """
5050 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5051
5052 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5053 """
5054
5055 iri_parts = urllib.parse.urlparse(iri)
5056
5057 if '[' in iri_parts.netloc:
5058 raise ValueError('IPv6 URIs are not yet supported.')
5059 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5060
5061 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5062
5063 net_location = ''
5064 if iri_parts.username:
5065 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5066 if iri_parts.password is not None:
5067 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5068 net_location += '@'
5069
5070 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5071 # The 'idna' encoding produces ASCII text.
5072 if iri_parts.port is not None and iri_parts.port != 80:
5073 net_location += ':' + str(iri_parts.port)
5074
5075 return urllib.parse.urlunparse(
5076 (iri_parts.scheme,
5077 net_location,
5078
5079 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5080
5081 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5082 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5083
5084 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5085 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5086
5087 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5088
5089 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
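# e.g. iri_to_uri('http://example.com/路径?q=值')
#   -> 'http://example.com/%E8%B7%AF%E5%BE%84?q=%E5%80%BC' (editor's sketch)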
5090
5091
5092 def to_high_limit_path(path):
5093 if sys.platform in ['win32', 'cygwin']:
5094 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5095 return '\\\\?\\' + os.path.abspath(path)
5096
5097 return path
5098
5099
5100 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5101 val = traverse_obj(obj, *variadic(field))
5102 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5103 return default
5104 return template % func(val)
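# e.g. format_field({'width': 1280}, 'width', '%dpx') -> '1280px' and
# format_field({}, 'width', '%dpx', default='unknown') -> 'unknown'
# (editor's sketch).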
5105
5106
5107 def clean_podcast_url(url):
5108 return re.sub(r'''(?x)
5109 (?:
5110 (?:
5111 chtbl\.com/track|
5112 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5113 play\.podtrac\.com
5114 )/[^/]+|
5115 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5116 flex\.acast\.com|
5117 pd(?:
5118 cn\.co| # https://podcorn.com/analytics-prefix/
5119 st\.fm # https://podsights.com/docs/
5120 )/e
5121 )/''', '', url)
5122
5123
5124 _HEX_TABLE = '0123456789abcdef'
5125
5126
5127 def random_uuidv4():
5128 return re.sub(r'[xy]', lambda m: _HEX_TABLE[random.randint(0, 15) if m.group(0) == 'x' else random.randint(8, 11)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')  # 'y' carries the RFC 4122 variant bits (8, 9, a or b)
5129
5130
5131 def make_dir(path, to_screen=None):
5132 try:
5133 dn = os.path.dirname(path)
5134 if dn and not os.path.exists(dn):
5135 os.makedirs(dn)
5136 return True
5137 except OSError as err:
5138 if callable(to_screen):
5139 to_screen('unable to create directory ' + error_to_compat_str(err))
5140 return False
5141
5142
5143 def get_executable_path():
5144 from .update import _get_variant_and_executable_path
5145
5146 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5147
5148
5149 def load_plugins(name, suffix, namespace):
5150 classes = {}
5151 with contextlib.suppress(FileNotFoundError):
5152 plugins_spec = importlib.util.spec_from_file_location(
5153 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5154 plugins = importlib.util.module_from_spec(plugins_spec)
5155 sys.modules[plugins_spec.name] = plugins
5156 plugins_spec.loader.exec_module(plugins)
5157 for name in dir(plugins):
5158 if name in namespace:
5159 continue
5160 if not name.endswith(suffix):
5161 continue
5162 klass = getattr(plugins, name)
5163 classes[name] = namespace[name] = klass
5164 return classes
5165
5166
5167 def traverse_obj(
5168 obj, *path_list, default=None, expected_type=None, get_all=True,
5169 casesense=True, is_user_input=False, traverse_string=False):
5170 ''' Traverse nested list/dict/tuple
5171 @param path_list A list of paths which are checked one by one.
5172 Each path is a list of keys where each key is a:
5173 - None: Do nothing
5174 - string: A dictionary key
5175 - int: An index into a list
5176 - tuple: A list of keys all of which will be traversed
5177 - Ellipsis: Fetch all values in the object
5178 - Function: Takes the key and value as arguments
5179 and returns whether the key matches or not
5180 @param default Default value to return
5181 @param expected_type Only accept final value of this type (Can also be any callable)
5182 @param get_all Return all the values obtained from a path or only the first one
5183 @param casesense Whether to consider dictionary keys as case sensitive
5184 @param is_user_input Whether the keys are generated from user input. If True,
5185 strings are converted to int/slice if necessary
5186 @param traverse_string Whether to traverse inside strings. If True, any
5187 non-compatible object will also be converted into a string
5188 # TODO: Write tests
5189 '''
5190 if not casesense:
5191 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5192 path_list = (map(_lower, variadic(path)) for path in path_list)
5193
5194 def _traverse_obj(obj, path, _current_depth=0):
5195 nonlocal depth
5196 path = tuple(variadic(path))
5197 for i, key in enumerate(path):
5198 if None in (key, obj):
5199 return obj
5200 if isinstance(key, (list, tuple)):
5201 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5202 key = ...
5203 if key is ...:
5204 obj = (obj.values() if isinstance(obj, dict)
5205 else obj if isinstance(obj, (list, tuple, LazyList))
5206 else str(obj) if traverse_string else [])
5207 _current_depth += 1
5208 depth = max(depth, _current_depth)
5209 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5210 elif callable(key):
5211 if isinstance(obj, (list, tuple, LazyList)):
5212 obj = enumerate(obj)
5213 elif isinstance(obj, dict):
5214 obj = obj.items()
5215 else:
5216 if not traverse_string:
5217 return None
5218 obj = str(obj)
5219 _current_depth += 1
5220 depth = max(depth, _current_depth)
5221 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5222 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5223 obj = (obj.get(key) if casesense or (key in obj)
5224 else next((v for k, v in obj.items() if _lower(k) == key), None))
5225 else:
5226 if is_user_input:
5227 key = (int_or_none(key) if ':' not in key
5228 else slice(*map(int_or_none, key.split(':'))))
5229 if key == slice(None):
5230 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5231 if not isinstance(key, (int, slice)):
5232 return None
5233 if not isinstance(obj, (list, tuple, LazyList)):
5234 if not traverse_string:
5235 return None
5236 obj = str(obj)
5237 try:
5238 obj = obj[key]
5239 except IndexError:
5240 return None
5241 return obj
5242
5243 if isinstance(expected_type, type):
5244 type_test = lambda val: val if isinstance(val, expected_type) else None
5245 else:
5246 type_test = expected_type or IDENTITY
5247
5248 for path in path_list:
5249 depth = 0
5250 val = _traverse_obj(obj, path)
5251 if val is not None:
5252 if depth:
5253 for _ in range(depth - 1):
5254 val = itertools.chain.from_iterable(v for v in val if v is not None)
5255 val = [v for v in map(type_test, val) if v is not None]
5256 if val:
5257 return val if get_all else val[0]
5258 else:
5259 val = type_test(val)
5260 if val is not None:
5261 return val
5262 return default
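# A few illustrative calls (editor's sketch; see the docstring above):
#   traverse_obj({'a': {'b': [1, 2]}}, ('a', 'b', 1))           -> 2
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))  -> [1, 2]
#   traverse_obj({}, ('a', 'b'), default='missing')             -> 'missing'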
5263
5264
5265 def traverse_dict(dictn, keys, casesense=True):
5266 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5267 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5268 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5269
5270
5271 def get_first(obj, keys, **kwargs):
5272 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5273
5274
5275 def variadic(x, allowed_types=(str, bytes, dict)):
5276 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5277
5278
5279 def time_seconds(**kwargs):
5280 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5281 return t.timestamp()
5282
5283
5284 # create a JSON Web Signature (JWS) with the HS256 algorithm
5285 # the resulting format is in JWS Compact Serialization
5286 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5287 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5288 def jwt_encode_hs256(payload_data, key, headers={}):
5289 header_data = {
5290 'alg': 'HS256',
5291 'typ': 'JWT',
5292 }
5293 if headers:
5294 header_data.update(headers)
5295 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5296 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5297 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5298 signature_b64 = base64.b64encode(h.digest())
5299 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5300 return token
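# e.g. jwt_encode_hs256({'sub': 'user'}, 'secret') returns bytes of the form
# b'<header>.<payload>.<signature>'. Note that standard base64 is used here,
# whereas RFC 7515 specifies unpadded base64url (editor's note).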
5301
5302
5303 # can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5304 def jwt_decode_hs256(jwt):
5305 header_b64, payload_b64, signature_b64 = jwt.split('.')
5306 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))  # re-add the padding that JWT's base64url encoding strips
5307 return payload_data
5308
5309
5310 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5311
5312
5313 @functools.cache
5314 def supports_terminal_sequences(stream):
5315 if compat_os_name == 'nt':
5316 if not WINDOWS_VT_MODE:
5317 return False
5318 elif not os.getenv('TERM'):
5319 return False
5320 try:
5321 return stream.isatty()
5322 except BaseException:
5323 return False
5324
5325
5326 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5327 if get_windows_version() < (10, 0, 10586):
5328 return
5329 global WINDOWS_VT_MODE
5330 try:
5331 Popen.run('', shell=True)
5332 except Exception:
5333 return
5334
5335 WINDOWS_VT_MODE = True
5336 supports_terminal_sequences.cache_clear()
5337
5338
5339 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5340
5341
5342 def remove_terminal_sequences(string):
5343 return _terminal_sequences_re.sub('', string)
5344
5345
5346 def number_of_digits(number):
5347 return len('%d' % number)
5348
5349
5350 def join_nonempty(*values, delim='-', from_dict=None):
5351 if from_dict is not None:
5352 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5353 return delim.join(map(str, filter(None, values)))
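# e.g. join_nonempty('mp4', None, 1080, delim='-') -> 'mp4-1080'; falsy values
# are dropped before joining (editor's example).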
5354
5355
5356 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5357 """
5358 Find the largest format dimensions in terms of video width and, for each thumbnail:
5359 * Modify the URL: Match the width with the provided regex and replace with the former width
5360 * Update dimensions
5361
5362 This function is useful with video services that scale the provided thumbnails on demand
5363 """
5364 _keys = ('width', 'height')
5365 max_dimensions = max(
5366 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5367 default=(0, 0))
5368 if not max_dimensions[0]:
5369 return thumbnails
5370 return [
5371 merge_dicts(
5372 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5373 dict(zip(_keys, max_dimensions)), thumbnail)
5374 for thumbnail in thumbnails
5375 ]
5376
5377
5378 def parse_http_range(range):
5379 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5380 if not range:
5381 return None, None, None
5382 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5383 if not crg:
5384 return None, None, None
5385 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
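# Editor's examples:
#   parse_http_range('bytes 500-999/1234') -> (500, 999, 1234)
#   parse_http_range('bytes=500-')         -> (500, None, None)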
5386
5387
5388 def read_stdin(what):
5389 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5390 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5391 return sys.stdin
5392
5393
5394 class Config:
5395 own_args = None
5396 parsed_args = None
5397 filename = None
5398 __initialized = False
5399
5400 def __init__(self, parser, label=None):
5401 self.parser, self.label = parser, label
5402 self._loaded_paths, self.configs = set(), []
5403
5404 def init(self, args=None, filename=None):
5405 assert not self.__initialized
5406 self.own_args, self.filename = args, filename
5407 return self.load_configs()
5408
5409 def load_configs(self):
5410 directory = ''
5411 if self.filename:
5412 location = os.path.realpath(self.filename)
5413 directory = os.path.dirname(location)
5414 if location in self._loaded_paths:
5415 return False
5416 self._loaded_paths.add(location)
5417
5418 self.__initialized = True
5419 opts, _ = self.parser.parse_known_args(self.own_args)
5420 self.parsed_args = self.own_args
5421 for location in opts.config_locations or []:
5422 if location == '-':
5423 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5424 continue
5425 location = os.path.join(directory, expand_path(location))
5426 if os.path.isdir(location):
5427 location = os.path.join(location, 'yt-dlp.conf')
5428 if not os.path.exists(location):
5429 self.parser.error(f'config location {location} does not exist')
5430 self.append_config(self.read_file(location), location)
5431 return True
5432
5433 def __str__(self):
5434 label = join_nonempty(
5435 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5436 delim=' ')
5437 return join_nonempty(
5438 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5439 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5440 delim='\n')
5441
5442 @staticmethod
5443 def read_file(filename, default=[]):
5444 try:
5445 optionf = open(filename)
5446 except OSError:
5447 return default # silently skip if file is not present
5448 try:
5449 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5450 contents = optionf.read()
5451 res = shlex.split(contents, comments=True)
5452 except Exception as err:
5453 raise ValueError(f'Unable to parse "{filename}": {err}')
5454 finally:
5455 optionf.close()
5456 return res
5457
    @staticmethod
    def hide_login_info(opts):
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

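    # For example (illustrative): hide_login_info(['-u', 'name', '--password=secret'])
    # would return ['-u', 'PRIVATE', '--password=PRIVATE'].
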
    def append_config(self, *args, label=None):
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)


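# A hedged usage sketch for Config (parser construction elided; names are
# illustrative). The parser is expected to expose a `config_locations` option,
# as load_configs() above assumes:
#
#   config = Config(parser, label='Main')
#   config.init(sys.argv[1:])          # loads any --config-locations files recursively
#   opts, args = config.parse_known_args()

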
class WebSocketsWrapper:
    """Wraps the websockets module for use in non-async scopes"""
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: the "loop" argument is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Cancel pending tasks *before* closing the loop;
            # running them on an already-closed loop would raise RuntimeError
            self._cancel_all_tasks(self.loop)
            self.loop.close()

    # Taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # For contributors: if any other library that uses asyncio needs to be run in a non-async scope,
    # move these functions out of this class
    @staticmethod
    def run_with_loop(main, loop):
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: the "loop" argument to asyncio.gather() is removed in Python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })


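# A hedged usage sketch for WebSocketsWrapper (the endpoint and payload are
# hypothetical):
#
#   ws = WebSocketsWrapper('wss://example.com/socket', headers={'Origin': 'https://example.com'})
#   ws.send('{"op": "subscribe"}')
#   message = ws.recv()
#   ws.__exit__(None, None, None)  # or rely on the atexit hook registered in __init__

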
def merge_headers(*dicts):
    """Merge dicts of HTTP headers case-insensitively, with later dicts taking priority"""
    return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}


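# For example (illustrative):
#   merge_headers({'accept': '*/*'}, {'Accept': 'text/html'}) == {'Accept': 'text/html'}

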
def cached_method(f):
    """Cache the return value of a method, keyed by its bound arguments"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound_args = signature.bind(self, *args, **kwargs)
        bound_args.apply_defaults()
        key = tuple(bound_args.arguments.values())

        if not hasattr(self, '__cached_method__cache'):
            self.__cached_method__cache = {}
        cache = self.__cached_method__cache.setdefault(f.__name__, {})
        if key not in cache:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper


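# A hedged sketch of the decorator above (the class, method and
# `expensive_lookup` names are illustrative, not part of this module):
#
#   class Extractor:
#       @cached_method
#       def fetch(self, url):
#           return expensive_lookup(url)   # runs only once per (self, url)
#
#   e = Extractor()
#   e.fetch('a'); e.fetch('a')   # the second call is served from the cache

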
class classproperty:
    """Property-style access for class methods: the getter receives the class, not an instance"""

    def __init__(self, func):
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, _, cls):
        return self.func(cls)


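# For example (illustrative):
#   class Foo:
#       @classproperty
#       def name(cls):
#           return cls.__name__
#
#   Foo.name == 'Foo'     # no instance required
#   Foo().name == 'Foo'   # instance access also resolves through the class

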
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        return iter(self.__dict__.values())

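    # The trailing underscore in `items_` presumably avoids clashing with a
    # namespace member named `items` (an assumption; the source does not say)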
    @property
    def items_(self):
        return self.__dict__.items()


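# For example (illustrative):
#   n = Namespace(video='mp4', audio='m4a')
#   list(n) == ['mp4', 'm4a']                          # __iter__ yields the values
#   dict(n.items_) == {'video': 'mp4', 'audio': 'm4a'}

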
# Deprecated
has_certifi = bool(certifi)
has_websockets = bool(websockets)