yt_dlp/utils.py

   1 import atexit
   2 import base64
   3 import binascii
   4 import calendar
   5 import codecs
   6 import collections
   7 import contextlib
   8 import ctypes
   9 import datetime
  10 import email.header
  11 import email.utils
  12 import errno
  13 import gzip
  14 import hashlib
  15 import hmac
  16 import html.entities
  17 import html.parser
  18 import http.client
  19 import http.cookiejar
  20 import importlib.util
  21 import inspect
  22 import io
  23 import itertools
  24 import json
  25 import locale
  26 import math
  27 import mimetypes
  28 import operator
  29 import os
  30 import platform
  31 import random
  32 import re
  33 import shlex
  34 import socket
  35 import ssl
  36 import struct
  37 import subprocess
  38 import sys
  39 import tempfile
  40 import time
  41 import traceback
  42 import types
  43 import urllib.error
  44 import urllib.parse
  45 import urllib.request
  46 import xml.etree.ElementTree
  47 import zlib
  48
  49 from .compat import asyncio, functools  # isort: split
  50 from .compat import (
  51     compat_etree_fromstring,
  52     compat_expanduser,
  53     compat_HTMLParseError,
  54     compat_os_name,
  55     compat_shlex_quote,
  56 )
  57 from .dependencies import brotli, certifi, websockets, xattr
  58 from .socks import ProxyType, sockssocket
  59
  60
  61 def register_socks_protocols():
  62     # "Register" SOCKS protocols
  63     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  64     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  65     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  66         if scheme not in urllib.parse.uses_netloc:
  67             urllib.parse.uses_netloc.append(scheme)
  68
  69
# Type of a compiled regular expression object, obtained by inspecting one
# (this is not clearly defined otherwise)
compiled_regex_type = type(re.compile(''))
  72
  73
  74 def random_user_agent():
  75     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  76     _CHROME_VERSIONS = (
  77         '90.0.4430.212',
  78         '90.0.4430.24',
  79         '90.0.4430.70',
  80         '90.0.4430.72',
  81         '90.0.4430.85',
  82         '90.0.4430.93',
  83         '91.0.4472.101',
  84         '91.0.4472.106',
  85         '91.0.4472.114',
  86         '91.0.4472.124',
  87         '91.0.4472.164',
  88         '91.0.4472.19',
  89         '91.0.4472.77',
  90         '92.0.4515.107',
  91         '92.0.4515.115',
  92         '92.0.4515.131',
  93         '92.0.4515.159',
  94         '92.0.4515.43',
  95         '93.0.4556.0',
  96         '93.0.4577.15',
  97         '93.0.4577.63',
  98         '93.0.4577.82',
  99         '94.0.4606.41',
 100         '94.0.4606.54',
 101         '94.0.4606.61',
 102         '94.0.4606.71',
 103         '94.0.4606.81',
 104         '94.0.4606.85',
 105         '95.0.4638.17',
 106         '95.0.4638.50',
 107         '95.0.4638.54',
 108         '95.0.4638.69',
 109         '95.0.4638.74',
 110         '96.0.4664.18',
 111         '96.0.4664.45',
 112         '96.0.4664.55',
 113         '96.0.4664.93',
 114         '97.0.4692.20',
 115     )
 116     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
 117
 118
# Names of the supported encodings; 'br' is only added when the optional
# brotli dependency is available
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default HTTP headers. The User-Agent is picked once, when this module is
# imported, so it stays the same for the lifetime of the process
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
 131
 132
# Fixed User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
 136
 137
# Unique sentinel used as the default of `default=` style parameters, so that
# "no default given" can be told apart from an explicit None
NO_DEFAULT = object()
# Function that returns its argument unchanged
IDENTITY = lambda x: x
 140
# English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names in calendar order, keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 151
# File extensions (without the leading dot) that are treated as known;
# presumably used to recognise media files and manifests - verify against callers
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
 166
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The replacements are
# zipped position by position with the characters of the first string; plain
# strings in the chain contribute one replacement per character, while the
# one-element lists carry the multi-character replacements ('AE', 'OE', ...)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
 171
# Date/time layouts written with strptime-style directives. These are the
# layouts that do not depend on whether the day or the month comes first;
# the two lists derived from this tuple add the ambiguous numeric layouts
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)
 216
# DATE_FORMATS plus the numeric layouts in which the day precedes the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])
 226
# DATE_FORMATS plus the numeric layouts in which the month precedes the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 235
# Captures four groups: a quoted payload, two integers and a quoted string
# that is split on '|'. NOTE(review): presumably the argument list of
# "packed" JavaScript - confirm against the code that uses it
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type is application/ld+json and captures
# the JSON object inside it as the named group `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

# Unsigned integer or decimal number (digits, optionally followed by '.' and more digits)
NUMBER_RE = r'\d+(?:\.\d+)?'
 240
 241
 242 @functools.cache
 243 def preferredencoding():
 244     """Get preferred encoding.
 245
 246     Returns the best encoding scheme for the system, based on
 247     locale.getpreferredencoding() and some further tweaks.
 248     """
 249     try:
 250         pref = locale.getpreferredencoding()
 251         'TEST'.encode(pref)
 252     except Exception:
 253         pref = 'UTF-8'
 254
 255     return pref
 256
 257
 258 def write_json_file(obj, fn):
 259     """ Encode obj as JSON and write it to fn, atomically if possible """
 260
 261     tf = tempfile.NamedTemporaryFile(
 262         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
 263         suffix='.tmp', delete=False, mode='w', encoding='utf-8')
 264
 265     try:
 266         with tf:
 267             json.dump(obj, tf, ensure_ascii=False)
 268         if sys.platform == 'win32':
 269             # Need to remove existing file on Windows, else os.rename raises
 270             # WindowsError or FileExistsError.
 271             with contextlib.suppress(OSError):
 272                 os.unlink(fn)
 273         with contextlib.suppress(OSError):
 274             mask = os.umask(0)
 275             os.umask(mask)
 276             os.chmod(tf.name, 0o666 & ~mask)
 277         os.rename(tf.name, fn)
 278     except Exception:
 279         with contextlib.suppress(OSError):
 280             os.remove(tf.name)
 281         raise
 282
 283
 284 def find_xpath_attr(node, xpath, key, val=None):
 285     """ Find the xpath xpath[@key=val] """
 286     assert re.match(r'^[a-zA-Z_-]+$', key)
 287     expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
 288     return node.find(expr)
 289
 290 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 291 # the namespace parameter
 292
 293
 294 def xpath_with_ns(path, ns_map):
 295     components = [c.split(':') for c in path.split('/')]
 296     replaced = []
 297     for c in components:
 298         if len(c) == 1:
 299             replaced.append(c[0])
 300         else:
 301             ns, tag = c
 302             replaced.append('{%s}%s' % (ns_map[ns], tag))
 303     return '/'.join(replaced)
 304
 305
 306 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 307     def _find_xpath(xpath):
 308         return node.find(xpath)
 309
 310     if isinstance(xpath, str):
 311         n = _find_xpath(xpath)
 312     else:
 313         for xp in xpath:
 314             n = _find_xpath(xp)
 315             if n is not None:
 316                 break
 317
 318     if n is None:
 319         if default is not NO_DEFAULT:
 320             return default
 321         elif fatal:
 322             name = xpath if name is None else name
 323             raise ExtractorError('Could not find XML element %s' % name)
 324         else:
 325             return None
 326     return n
 327
 328
 329 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 330     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 331     if n is None or n == default:
 332         return n
 333     if n.text is None:
 334         if default is not NO_DEFAULT:
 335             return default
 336         elif fatal:
 337             name = xpath if name is None else name
 338             raise ExtractorError('Could not find XML element\'s text %s' % name)
 339         else:
 340             return None
 341     return n.text
 342
 343
 344 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 345     n = find_xpath_attr(node, xpath, key)
 346     if n is None:
 347         if default is not NO_DEFAULT:
 348             return default
 349         elif fatal:
 350             name = f'{xpath}[@{key}]' if name is None else name
 351             raise ExtractorError('Could not find XML attribute %s' % name)
 352         else:
 353             return None
 354     return n.attrib[key]
 355
 356
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document

    Extra keyword arguments are forwarded to get_element_by_attribute().
    Returns None if no such tag is found.
    """
    return get_element_by_attribute('id', id, html, **kwargs)
 360
 361
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document

    Extra keyword arguments are forwarded to get_element_html_by_attribute().
    Returns None if no such tag is found.
    """
    return get_element_html_by_attribute('id', id, html, **kwargs)
 365
 366
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document

    Returns None if no tag has that class.
    """
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None
 371
 372
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document

    Returns None if no tag has that class.
    """
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None
 377
 378
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag whose attribute has the given value, or None

    Extra keyword arguments are forwarded to get_elements_by_attribute().
    """
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None
 382
 383
def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the html of the first tag whose attribute has the given value, or None

    Extra keyword arguments are forwarded to get_elements_html_by_attribute().
    """
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None
 387
 388
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list

    NOTE: extra keyword arguments are accepted but not used.
    """
    # The value is passed as a regex (escape_value=False): class_name has to
    # appear in the class attribute delimited by a quote or whitespace on both
    # sides, with any other non-quote characters around it
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
 394
 395
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # The value is passed as a regex (escape_value=False): class_name has to
    # appear in the class attribute delimited by a quote or whitespace on both
    # sides, with any other non-quote characters around it
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
 401
 402
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list

    Takes the same arguments as get_elements_text_and_html_by_attribute().
    """
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 406
 407
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list

    Takes the same arguments as get_elements_text_and_html_by_attribute().
    """
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
 411
 412
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Yield the text (content) and the html (whole) of each tag with the specified
    attribute in the passed HTML document

    @param attribute     Name of the attribute (always matched literally)
    @param value         Value the attribute must have
    @param html          The HTML document to search
    @param escape_value  If true, value is matched literally;
                         otherwise it is used as a regular expression

    This is a generator of (content, whole) tuples.
    """

    # If value starts with whitespace, a quote, a backtick, '=', '<' or '>',
    # the attribute value must be quoted in the document; otherwise the
    # quote is optional
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches an opening tag from '<' up to and including the wanted
    # attribute; earlier attributes (quoted or not) are skipped over
    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Extract the complete element, starting at the matched opening tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Drop one pair of quotes enclosing the whole content, then decode HTML entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
 436
 437
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        """Raised by handle_endtag() once the stack of open tags has become empty"""
        pass

    def __init__(self):
        # Tags that have been opened and not yet closed, outermost first
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        """Record the opened tag; its attributes are ignored"""
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        """Pop the stack down to the matching opening tag.

        Tags that were opened after the matching one and never closed are
        dropped along the way. Raises compat_HTMLParseError if there is no
        matching opening tag, and HTMLBreakOnClosingTagException if the stack
        is empty afterwards.
        """
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            # The whole stack was popped without finding the tag
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
 478
 479
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    Returns a (content, whole) tuple.
    Raises compat_HTMLParseError if the element cannot be located or is not closed.
    """
    def find_or_raise(haystack, needle, exc):
        # Like str.index(), but raises the given exception when needle is missing
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    # The literal search also hits longer tag names starting with `tag`;
    # those are rejected by the tagstack check below
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Convert to an index into html, just past the '>' of the opening tag
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed the opening tag first so that it is at the bottom of the tag stack
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document one closing tag at a time; the parser raises its
        # break exception when the element opened above gets closed
        while offset < len(html):
            # Both positions are relative to offset
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                # This closing tag belonged to a nested element; keep going
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
 513
 514
 515 class HTMLAttributeParser(html.parser.HTMLParser):
 516     """Trivial HTML parser to gather the attributes for a single element"""
 517
 518     def __init__(self):
 519         self.attrs = {}
 520         html.parser.HTMLParser.__init__(self)
 521
 522     def handle_starttag(self, tag, attrs):
 523         self.attrs = dict(attrs)
 524
 525
 526 class HTMLListAttrsParser(html.parser.HTMLParser):
 527     """HTML parser to gather the attributes for the elements of a list"""
 528
 529     def __init__(self):
 530         html.parser.HTMLParser.__init__(self)
 531         self.items = []
 532         self._level = 0
 533
 534     def handle_starttag(self, tag, attrs):
 535         if tag == 'li' and self._level == 0:
 536             self.items.append(dict(attrs))
 537         self._level += 1
 538
 539     def handle_endtag(self, tag):
 540         self._level -= 1
 541
 542
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.

    HTML parse errors are suppressed; whatever was collected up to that
    point (possibly an empty dict) is returned.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
 562
 563
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list with one dictionary of attributes per top-level <li>"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
 571
 572
 573 def clean_html(html):
 574     """Clean an HTML snippet into a readable string"""
 575
 576     if html is None:  # Convenience for sanitizing descriptions etc.
 577         return html
 578
 579     html = re.sub(r'\s+', ' ', html)
 580     html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
 581     html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
 582     # Strip html tags
 583     html = re.sub('<.*?>', '', html)
 584     # Replace html entities
 585     html = unescapeHTML(html)
 586     return html.strip()
 587
 588
 589 class LenientJSONDecoder(json.JSONDecoder):
 590     def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
 591         self.transform_source, self.ignore_extra = transform_source, ignore_extra
 592         super().__init__(*args, **kwargs)
 593
 594     def decode(self, s):
 595         if self.transform_source:
 596             s = self.transform_source(s)
 597         if self.ignore_extra:
 598             return self.raw_decode(s.lstrip())[0]
 599         return super().decode(s)
 600
 601
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            # Put stdout into binary mode
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # At most two attempts: the given name first, then the sanitized one
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    # NOTE(review): presumably an OSError subclass, so that the
                    # handler below falls back to a plain open() - confirm
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed or is unsupported; open the file without a lock
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt, or if access was denied
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                # Sanitizing changed nothing, so retrying cannot help
                raise
 636
 637
 638 def timeconvert(timestr):
 639     """Convert RFC 2822 defined time string into system timestamp"""
 640     timestamp = None
 641     timetuple = email.utils.parsedate_tz(timestr)
 642     if timetuple is not None:
 643         timestamp = email.utils.mktime_tz(timetuple)
 644     return timestamp
 645
 646
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect

    Returns the sanitized string; '' is returned only for '' input,
    otherwise the result is never empty ('_' is used as a last resort).
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Returns the replacement for a single character. Characters that are
        # introduced as substitutes are prefixed with '\0' so that they can be
        # told apart from characters of the input in the clean-up below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # '?' and control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers; fall back to '_' if nothing is left
    result = result.replace('\0', '') or '_'

    # NO_DEFAULT is a plain object and therefore truthy, so this clean-up
    # only runs when is_id was explicitly passed as a false value
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
 694
 695
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    @param force    Also sanitize on other platforms (ignored on Windows)

    On other platforms the path is returned unchanged unless force is set.
    """
    if sys.platform == 'win32':
        # force only matters for the non-Windows handling of a leading separator below
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the first component (empty if the path continues with a
        # separator after the drive); the root is re-added together with
        # the drive below
        norm_path.pop(0)
    # In every component except '.' and '..', replace each of /<>:"|\?* as
    # well as a trailing whitespace character or dot with '#'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
 717
 718
 719 def sanitize_url(url):
 720     # Prepend protocol-less URLs with `http:` scheme in order to mitigate
 721     # the number of unwanted failures due to missing protocol
 722     if url is None:
 723         return
 724     elif url.startswith('//'):
 725         return 'http:%s' % url
 726     # Fix some common typos seen so far
 727     COMMON_TYPOS = (
 728         # https://github.com/ytdl-org/youtube-dl/issues/15649
 729         (r'^httpss://', r'https://'),
 730         # https://bx1.be/lives/direct-tv/
 731         (r'^rmtp([es]?)://', r'rtmp\1://'),
 732     )
 733     for mistake, fixup in COMMON_TYPOS:
 734         if re.match(mistake, url):
 735             return re.sub(mistake, fixup, url)
 736     return url
 737
 738
 739 def extract_basic_auth(url):
 740     parts = urllib.parse.urlsplit(url)
 741     if parts.username is None:
 742         return url, None
 743     url = urllib.parse.urlunsplit(parts._replace(netloc=(
 744         parts.hostname if parts.port is None
 745         else '%s:%d' % (parts.hostname, parts.port))))
 746     auth_payload = base64.b64encode(
 747         ('%s:%s' % (parts.username, parts.password or '')).encode())
 748     return url, f'Basic {auth_payload.decode()}'
 749
 750
def sanitized_Request(url, *args, **kwargs):
    """Create a urllib.request.Request for a sanitized version of url.

    The URL is passed through sanitize_url() and escape_url(). Credentials
    embedded in it are removed and sent as an Authorization header instead.
    The remaining arguments are passed on to urllib.request.Request.
    """
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # Use the headers passed by the caller (second positional argument or
        # `headers` keyword), creating the keyword if needed. Note that a
        # caller-supplied dict is modified in place
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
 757
 758
def expand_path(s):
    """Expand shell variables and ~

    The user directory is expanded first (compat_expanduser), then the
    environment variables (os.path.expandvars).
    """
    return os.path.expandvars(compat_expanduser(s))
 762
 763
 764 def orderedSet(iterable, *, lazy=False):
 765     """Remove all duplicates from the input iterable"""
 766     def _iter():
 767         seen = []  # Do not use set since the items can be unhashable
 768         for x in iterable:
 769             if x not in seen:
 770                 seen.append(x)
 771                 yield x
 772
 773     return _iter() if lazy else list(_iter())
 774
 775
 776 def _htmlentity_transform(entity_with_semicolon):
 777     """Transforms an HTML entity to a character."""
 778     entity = entity_with_semicolon[:-1]
 779
 780     # Known non-numeric HTML entity
 781     if entity in html.entities.name2codepoint:
 782         return chr(html.entities.name2codepoint[entity])
 783
 784     # TODO: HTML5 allows entities without a semicolon. For example,
 785     # '&Eacuteric' should be decoded as 'Éric'.
 786     if entity_with_semicolon in html.entities.html5:
 787         return html.entities.html5[entity_with_semicolon]
 788
 789     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 790     if mobj is not None:
 791         numstr = mobj.group(1)
 792         if numstr.startswith('x'):
 793             base = 16
 794             numstr = '0%s' % numstr
 795         else:
 796             base = 10
 797         # See https://github.com/ytdl-org/youtube-dl/issues/7518
 798         with contextlib.suppress(ValueError):
 799             return chr(int(numstr, base))
 800
 801     # Unknown entity in name, return its literal representation
 802     return '&%s;' % entity
 803
 804
 805 def unescapeHTML(s):
 806     if s is None:
 807         return None
 808     assert isinstance(s, str)
 809
 810     return re.sub(
 811         r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 812
 813
 814 def escapeHTML(text):
 815     return (
 816         text
 817         .replace('&', '&amp;')
 818         .replace('<', '&lt;')
 819         .replace('>', '&gt;')
 820         .replace('"', '&quot;')
 821         .replace("'", '&#39;')
 822     )
 823
 824
def process_communicate_or_kill(p, *args, **kwargs):
    """Deprecated alias of Popen.communicate_or_kill

    Writes a deprecation warning (via write_string) on every call, then
    delegates to Popen.communicate_or_kill with p as the process.
    """
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)
 829
 830
 831 class Popen(subprocess.Popen):
 832     if sys.platform == 'win32':
 833         _startupinfo = subprocess.STARTUPINFO()
 834         _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 835     else:
 836         _startupinfo = None
 837
 838     def __init__(self, *args, text=False, **kwargs):
 839         if text is True:
 840             kwargs['universal_newlines'] = True  # For 3.6 compatibility
 841             kwargs.setdefault('encoding', 'utf-8')
 842             kwargs.setdefault('errors', 'replace')
 843         super().__init__(*args, **kwargs, startupinfo=self._startupinfo)
 844
 845     def communicate_or_kill(self, *args, **kwargs):
 846         try:
 847             return self.communicate(*args, **kwargs)
 848         except BaseException:  # Including KeyboardInterrupt
 849             self.kill(timeout=None)
 850             raise
 851
 852     def kill(self, *, timeout=0):
 853         super().kill()
 854         if timeout != 0:
 855             self.wait(timeout=timeout)
 856
 857     @classmethod
 858     def run(cls, *args, **kwargs):
 859         with cls(*args, **kwargs) as proc:
 860             stdout, stderr = proc.communicate_or_kill()
 861             return stdout or '', stderr or '', proc.returncode
 862
 863
 864 def get_subprocess_encoding():
 865     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 866         # For subprocess calls, encode with locale encoding
 867         # Refer to http://stackoverflow.com/a/9951851/35070
 868         encoding = preferredencoding()
 869     else:
 870         encoding = sys.getfilesystemencoding()
 871     if encoding is None:
 872         encoding = 'utf-8'
 873     return encoding
 874
 875
def encodeFilename(s, for_subprocess=False):
    """Return the filename unchanged after asserting that it is a str

    for_subprocess is not used.
    """
    assert isinstance(s, str)
    return s
 879
 880
def decodeFilename(b, for_subprocess=False):
    """Return the filename unchanged; for_subprocess is not used"""
    return b
 883
 884
 885 def encodeArgument(s):
 886     # Legacy code that uses byte strings
 887     # Uncomment the following line after fixing all post processors
 888     # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
 889     return s if isinstance(s, str) else s.decode('ascii')
 890
 891
 892 def decodeArgument(b):
 893     return b
 894
 895
 896 def decodeOption(optval):
 897     if optval is None:
 898         return optval
 899     if isinstance(optval, bytes):
 900         optval = optval.decode(preferredencoding())
 901
 902     assert isinstance(optval, str)
 903     return optval
 904
 905
 906 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
 907
 908
 909 def timetuple_from_msec(msec):
 910     secs, msec = divmod(msec, 1000)
 911     mins, secs = divmod(secs, 60)
 912     hrs, mins = divmod(mins, 60)
 913     return _timetuple(hrs, mins, secs, msec)
 914
 915
 916 def formatSeconds(secs, delim=':', msec=False):
 917     time = timetuple_from_msec(secs * 1000)
 918     if time.hours:
 919         ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
 920     elif time.minutes:
 921         ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
 922     else:
 923         ret = '%d' % time.seconds
 924     return '%s.%03d' % (ret, time.milliseconds) if msec else ret
 925
 926
 927 def _ssl_load_windows_store_certs(ssl_context, storename):
 928     # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
 929     try:
 930         certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
 931                  if encoding == 'x509_asn' and (
 932                      trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
 933     except PermissionError:
 934         return
 935     for cert in certs:
 936         with contextlib.suppress(ssl.SSLError):
 937             ssl_context.load_verify_locations(cadata=cert)
 938
 939
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose SSL context is configured from `params`.

    Keys read from `params`: 'nocheckcertificate', 'legacyserverconnect',
    'compat_opts', 'client_certificate', 'client_certificate_key' and
    'client_certificate_password'.

    @param params   Mapping of options (accessed with `.get`)
    @param kwargs   Extra keyword arguments passed on to YoutubeDLHTTPSHandler
    @raises YoutubeDLError  If the client certificate cannot be loaded
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # NOTE: check_hostname is assigned before verify_mode on purpose; the ssl module
    # does not accept CERT_NONE while hostname checking is still enabled
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer the certifi CA bundle unless it is unavailable or disabled via compat_opts
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    # Optional client-side certificate (with optional separate key file and password)
    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 982
 983
 984 def bug_reports_message(before=';'):
 985     from .update import REPOSITORY
 986
 987     msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
 988            'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')
 989
 990     before = before.rstrip()
 991     if not before or before.endswith(('.', '!', '?')):
 992         msg = msg[0].title() + msg[1:]
 993
 994     return (before + ' ' if before else '') + msg
 995
 996
 997 class YoutubeDLError(Exception):
 998     """Base exception for YoutubeDL errors."""
 999     msg = None
1000
1001     def __init__(self, msg=None):
1002         if msg is not None:
1003             self.msg = msg
1004         elif self.msg is None:
1005             self.msg = type(self).__name__
1006         super().__init__(self.msg)
1007
1008
# Tuple of exception classes treated as network errors;
# ssl.CertificateError is included only when the ssl module provides it
network_exceptions = (
    urllib.error.URLError,
    http.client.HTTPException,
    socket.error,
    *((ssl.CertificateError,) if hasattr(ssl, 'CertificateError') else ()),
)
1013
1014
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """Create the error and compose its full message.

        @param msg       The bare error message; stored stringified as `orig_msg`
        @param tb        If given, the original traceback (so that it can be printed out)
        @param expected  If set, this is a normal error message and most likely not a bug
                         in yt-dlp. It is forced to True when the exception currently being
                         handled has a type listed in `network_exceptions`
        @param cause     The exception that caused this error, if any
        @param video_id  ID of the video being processed, prefixed to the message
        @param ie        Name of the extractor, prefixed to the message in brackets
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            # Created while handling another ExtractorError: keep the innermost exc_info
            self.exc_info = self.exc_info[1].exc_info

        # Join the stringified message: a non-str `msg` (e.g. an exception object)
        # would otherwise make `str.join` raise TypeError
        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            self.orig_msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        """Return the formatted traceback of `tb` and/or `cause`, or None if neither is set."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
1047
1048
class UnsupportedError(ExtractorError):
    """Expected extractor error reporting an unsupported URL (kept in `url`)."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1054
1055
class RegexNotFoundError(ExtractorError):
    """Extractor error signalling that a regular expression did not match."""
1059
1060
class GeoRestrictedError(ExtractorError):
    """Geographic restriction error.

    May be raised when a video is not available from the user's location
    because of geographic restrictions imposed by a website. It is always
    marked as expected; `countries` is stored on the instance as given.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # `expected` is forced to True regardless of what the caller passed
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
1072
1073
class DownloadError(YoutubeDLError):
    """Download error.

    May be raised by FileDownloader objects that are not configured to
    continue on errors; it carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """`exc_info`, if given, is the original exception (as returned by sys.exc_info())."""
        self.exc_info = exc_info
        super().__init__(msg)
1086
1087
class EntryNotInPlaylist(YoutubeDLError):
    """Raised by YoutubeDL when a requested entry is missing from the playlist info_dict."""
    msg = 'Entry not found in info'
1095
1096
class SameFileError(YoutubeDLError):
    """Same-file error.

    Raised by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk. A given `filename`
    is appended to the message.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
1109
1110
class PostProcessingError(YoutubeDLError):
    """Post-processing error.

    May be raised by a PostProcessor's .run() method to signal that the
    postprocessing task failed.
    """
1117
1118
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted."""
    msg = 'The download was cancelled'
1122
1123
class ExistingVideoReached(DownloadCancelled):
    """Cancellation triggered by --break-on-existing."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1127
1128
class RejectedVideoReached(DownloadCancelled):
    """Cancellation triggered by --break-on-reject."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1132
1133
class MaxDownloadsReached(DownloadCancelled):
    """Cancellation triggered once the --max-downloads limit has been reached."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1137
1138
class ReExtractInfo(YoutubeDLError):
    """Signals that the video info needs to be re-extracted.

    The `expected` flag is stored on the instance as given.
    """

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1145
1146
class ThrottledDownload(ReExtractInfo):
    """Raised when the download speed is below --throttled-rate; never marked as expected."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(msg=self.msg, expected=False)
1153
1154
class UnavailableVideoError(YoutubeDLError):
    """Unavailable format error.

    Raised when a video is requested in a format that is not available for
    that video. A given `err` is appended to the message.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1167
1168
class ContentTooShortError(YoutubeDLError):
    """Content-too-short error.

    May be raised by FileDownloader objects when a downloaded file is smaller
    than what the server announced first, which indicates that the connection
    was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are in bytes
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1182
1183
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an errno `code`, a message and a derived `reason`.

    `reason` is 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED', decided from
    the error code and from well-known phrases in the message.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        def message_mentions(*phrases):
            return any(phrase in self.msg for phrase in phrases)

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or message_mentions('No space left', 'Disk quota exceeded')):
            reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or message_mentions('Argument list too long'):
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1198
1199
class XAttrUnavailableError(YoutubeDLError):
    """YoutubeDLError subclass with no behavior of its own.

    Presumably raised when extended-attribute support is unavailable -- confirm against callers.
    """
1202
1203
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate `http_class` and, if configured, bind it to a source address.

    @param ydl_handler  Handler whose `_params` mapping is read for 'source_address'
    @param http_class   Connection class to instantiate with `*args, **kwargs`
    @param is_https     Not used in this function body
    @returns            The connection object. When a source address is configured, its
                        `source_address` is set to `(source_address, 0)` and, if the object
                        has a `_create_connection` attribute, that attribute is replaced
                        by a function that only tries remote addresses of the same
                        address family as the source address
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            # NOTE: the `source_address` parameter shadows the outer variable of the same
            # name; it is indexed below, so a `(host, port)` tuple is expected here
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A dot in the source host is taken to mean IPv4, anything else IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate in turn; the first successful connection is returned
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    # Remember the failure and close the socket before trying the next address
                    err = _
                    if sock is not None:
                        sock.close()
            # Every candidate failed: re-raise the last error, or report the empty address list
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1249
1250
def handle_youtubedl_headers(headers):
    """Apply and strip the internal "Youtubedl-no-compression" marker header.

    Without the marker, `headers` is returned as-is (the same object). With
    it, a new dict is returned that leaves out the marker and every
    Accept-Encoding header (matched case-insensitively); the input mapping
    is not modified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    kept = {}
    for name, value in headers.items():
        if name.lower() != 'accept-encoding':
            kept[name] = value
    del kept[marker]
    return kept
1259
1260
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped,
    deflated and brotli-compressed responses from web servers. If compression
    is to be avoided in a particular request, the original request in the
    program code only has to include the HTTP header "Youtubedl-no-compression",
    which will be removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store `params`; the remaining arguments go to urllib's HTTPHandler."""
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open `req`, using a SOCKS connection class if "Ytdl-socks-proxy" is set."""
        conn_class = http.client.HTTPConnection

        # The proxy header is internal: it is consumed here and removed from the request
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate `data`; empty data is returned unchanged."""
        if not data:
            return data
        # First try a raw deflate stream (negative wbits), then the default zlib format
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli `data`; empty data is returned unchanged."""
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        """Prepare `req`: escape its URL and fill in default headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Add the configured (or standard) headers that the request does not set itself
        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        # Honour and strip the internal "Youtubedl-no-compression" marker
        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Decompress the response body and escape a redirect "Location" header.

        A response with a "Content-encoding" of gzip, deflate or br is read completely
        and replaced by a new response object wrapping the decompressed data; the
        "Content-encoding" header is then removed.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed; give up with the original error
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS requests and responses get the same pre- and post-processing
    https_request = http_request
    https_response = http_response
1389
1390
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of `base_class` that connects through a SOCKS proxy.

    @param base_class   http.client.HTTPConnection, HTTPSConnection or a subclass
    @param socks_proxy  Proxy URL with scheme socks, socks4, socks4a or socks5
                        (case-insensitive); the port defaults to 1080 and the
                        username/password are percent-decoded when present
    @returns            The connection class; its `connect()` opens the socket via the
                        proxy and wraps it in TLS for HTTPS connections
    @raises ValueError  If the proxy URL uses any other scheme
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise ValueError(f'Unsupported SOCKS proxy scheme: {scheme!r}')

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1432
1433
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler that supports SOCKS proxies and a configurable connection class."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Store `params` and the connection class (default: http.client.HTTPSConnection)."""
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection

    def https_open(self, req):
        """Open `req`; a TLS handshake-failure alert is re-raised as YoutubeDLError."""
        conn_class = self._https_conn_class

        # Forward whichever SSL settings are present on this handler instance
        conn_kwargs = {
            option: getattr(self, attr)
            for option, attr in (('context', '_context'), ('check_hostname', '_check_hostname'))
            if hasattr(self, attr)}

        # The proxy header is internal: it is consumed here and removed from the request
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, conn_class, True)
        try:
            return self.do_open(connection_factory, req, **conn_kwargs)
        except urllib.error.URLError as e:
            handshake_failed = (
                isinstance(e.reason, ssl.SSLError)
                and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE')
            if handshake_failed:
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
1462
1463
class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    Cookie jar for Netscape-format cookie files; it accepts either a path or
    an already opened file object. See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        """Remember `filename`, which is a path (normalized with os.fspath) or a file object."""
        super().__init__(None, *args, **kwargs)
        self.filename = os.fspath(filename) if self.is_path(filename) else filename

    @staticmethod
    def _true_or_false(cndn):
        """Return 'TRUE' for a truthy `cndn`, 'FALSE' otherwise."""
        return ('FALSE', 'TRUE')[bool(cndn)]

    @staticmethod
    def is_path(file):
        """Tell whether `file` is a path (str, bytes or os.PathLike) rather than a file object."""
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        """Context manager yielding a text file object for `file`.

        Paths are opened as UTF-8 (and closed on exit); file objects are
        yielded as they are, after being truncated when `write` is set.
        """
        if not self.is_path(file):
            if write:
                file.truncate(0)
            yield file
            return

        mode = 'w' if write else 'r'
        with open(file, mode, encoding='utf-8') as handle:
            yield handle

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        """Write one tab-separated line per cookie to `f`, skipping discarded/expired ones."""
        now = time.time()
        for cookie in self:
            skip_discarded = not ignore_discard and cookie.discard
            if skip_discarded or (not ignore_expires and cookie.is_expired(now)):
                continue
            if cookie.value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', cookie.name
            else:
                name, value = cookie.name, cookie.value
            fields = (
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value,
            )
            f.write('%s\n' % '\t'.join(fields))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file (defaults to the jar's own filename).
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file (defaults to the jar's own filename).

        Malformed entries are skipped with a warning; a file that looks like
        JSON raises http.cookiejar.LoadError.
        """
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            """Validate one line, dropping a leading "#HttpOnly_" prefix."""
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            fields = line.split('\t')
            if len(fields) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(fields))
            entry = self._CookieFileEntry(*fields)
            if entry.expires_at and not entry.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % entry.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    prepared = prepare_line(line)
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See  '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                else:
                    cf.write(prepared)
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1596
1597
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """Cookie processor whose HTTPS hooks reuse the HTTP cookie handling."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        """Delegate to urllib's HTTPCookieProcessor.http_response."""
        return super().http_response(request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1607
1608
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    Based on the HTTPRedirectHandler implementation from CPython [1], with:
     - 308 Permanent Redirect [2] (used by some sites [3]) routed through
       the same code path as the other 30x status codes
     - content-length/content-type headers left out of the redirected request
     - POST turned into GET on 301/302/303, as browsers do

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the Request that follows a redirect response.

        Called by the http_error_30x methods. Returns a new Request aimed at
        `newurl`, or raises HTTPError when this combination of status code and
        request method must not be followed.
        """
        method = req.get_method()
        followable = (
            method in ('GET', 'HEAD') and code in (301, 302, 303, 307, 308)
            or method == 'POST' and code in (301, 302, 303))
        if not followable:
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # RFC 2616 forbids following a 301/302 answer to a POST without user
        # confirmation, but essentially every client does so anyway; so do we.

        # Tolerate URIs containing a space. http_error_302() already performs
        # a more complete encoding; this is kept for other callers.
        newurl = newurl.replace(' ', '%20')

        # Headers describing the (now dropped) request body are not forwarded
        body_headers = ('content-length', 'content-type')
        forwarded = {}
        for name, value in req.headers.items():
            if name.lower() not in body_headers:
                forwarded[name] = value

        # A 303 must be followed with GET (or HEAD if that was the original method)
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        # Browsers also commonly turn a POST into a GET on 301 and 302
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if (code == 303 and method != 'HEAD') or (code in (301, 302) and method == 'POST'):
            method = 'GET'

        return urllib.request.Request(
            newurl, headers=forwarded, origin_req_host=req.origin_req_host,
            unverifiable=True, method=method)
1669
1670
def extract_timezone(date_str):
    """Split a trailing timezone designator off a date string

    Returns a tuple (timezone, date_str): the UTC offset as a
    datetime.timedelta (zero if nothing is recognised or for a plain "Z"),
    and the date string shortened by the length of the matched designator.
    """
    mobj = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if mobj is None:
        return datetime.timedelta(), date_str

    # Cut as many characters off the end as the matched designator is long
    remainder = date_str[:-len(mobj.group('tz'))]
    sign = mobj.group('sign')
    if not sign:
        # Bare "Z": UTC
        return datetime.timedelta(), remainder

    direction = 1 if sign == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(mobj.group('hours')),
        minutes=direction * int(mobj.group('minutes')))
    return offset, remainder
1695
1696
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param delimiter  Separator between the date and the time part
    @param timezone   UTC offset as a datetime.timedelta; when None it is
                      extracted from date_str itself
    Returns None if date_str is None or does not match the expected layout
    """
    if date_str is None:
        return None

    # Fractional seconds are discarded
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        local = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((local - timezone).timetuple())
    except ValueError:
        return None
1712
1713
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST if day_first is truthy, else DATE_FORMATS_MONTH_FIRST"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1716
1717
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or cannot be parsed
    """
    if date_str is None:
        return None

    # Commas become spaces; AM/PM (plus a following timezone name) is removed
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    _, cleaned = extract_timezone(cleaned)

    result = None
    # NB: every format is tried, so the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to the RFC 2822 style parser
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if result is None else str(result)
1740
1741
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string

    @param day_first  Passed to date_formats() to choose which set of
                      strptime formats is tried
    Returns None if date_str is None or cannot be parsed
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # "PM" anywhere in the string shifts the result by 12 hours
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The numeric UTC offset was already stripped from date_str by
        # extract_timezone(), so parsedate_tz() cannot see it any more;
        # apply it here just like in the strptime path above
        return calendar.timegm(timetuple) + pm_delta * 3600 - int(timezone.total_seconds())
1771
1772
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL

    Returns default_ext when url is None, contains no dot, or its last
    dot-separated piece (query string excluded) does not look like an extension
    """
    if url is None or '.' not in url:
        return default_ext

    # Text after the last dot of whatever precedes the first "?"
    candidate = url.split('?', 1)[0].rsplit('.', 1)[-1]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1784
1785
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle filename by replacing the extension with "<sub_lang>.<sub_format>" """
    sub_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, sub_ext, expected_real_ext)
1788
1789
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    now = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str == 'yesterday':
        return now - datetime.timedelta(days=1)
    if date_str in ('now', 'today'):
        return now

    mobj = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if mobj is None:
        # A plain DATE without any offset
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # Resolve the part before the offset first (may itself carry an offset)
    base = datetime_from_str(mobj.group('start'), precision, format)
    amount = int(mobj.group('time'))
    if mobj.group('sign') == '-':
        amount = -amount
    unit = mobj.group('unit')

    if unit in ('month', 'year'):
        # Calendar-aware arithmetic; the result is rounded to days in auto mode
        shifted = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(shifted, unit) if auto_precision else shifted
1830
1831
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
                   Anything else raises ValueError
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    moment = datetime_from_str(date_str, precision='microsecond', format=format)
    return moment.date()
1842
1843
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    # Work with a zero-based month index so divmod yields the year carry
    year_carry, month_index = divmod(dt.month + months - 1, 12)
    year, month = dt.year + year_carry, month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1851
1852
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    @param precision  microsecond|second|minute|hour|day
                      'microsecond' returns dt unchanged
    """
    if precision == 'microsecond':
        return dt

    seconds = calendar.timegm(dt.timetuple())
    step = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    # Nearest multiple of step, with halves rounded up
    rounded = ((seconds + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1869
1870
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not in 'YYYYMMDD' format are returned unchanged"""
    mobj = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if mobj is None else '-'.join(mobj.groups())
1879
1880
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str(strict=True)

        A bound left as None is open: it defaults to the earliest/latest
        representable date. Raises ValueError if start lies after end.
        """
        if start is not None:
            self.start = date_from_str(start, strict=True)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end, strict=True)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range

        Accepts a datetime.date, a datetime.datetime (its date part is used)
        or a string understood by date_from_str
        """
        if isinstance(date, datetime.datetime):
            # datetime is a subclass of date, yet the two cannot be ordered
            # against each other (TypeError), so reduce it to its date first
            date = date.date()
        elif not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return f'{self.start.isoformat()} - {self.end.isoformat()}'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1914
1915
def platform_name():
    """ Returns the platform name as a str

    Deprecated: writes a deprecation notice via write_string on every call
    """
    notice = 'DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead'
    write_string(notice)
    return platform.platform()
1920
1921
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime and platform (cached)"""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    version = platform.python_version()
    bits = platform.architecture()[0]
    system = platform.platform()
    # libc name and version, wrapped in parentheses by format_field
    libc = format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)')
    return 'Python %s (%s %s) - %s %s' % (version, implementation, bits, system, libc)
1935
1936
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    # Second field of platform.win32_ver(), converted by version_tuple
    return version_tuple(platform.win32_ver()[1])
1944
1945
def write_string(s, out=None, encoding=None):
    """Write the str s to out (sys.stderr by default) and flush it

    The text is encoded (undecodable characters ignored) when out is a
    binary stream or exposes an underlying `buffer`; otherwise it is
    written as-is.
    """
    assert isinstance(s, str)
    out = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # Put a space in front of every run of line breaks
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: write encoded bytes directly to it
        target, enc = out, encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying buffer: bypass the text layer
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
    else:
        target, enc = out, None

    payload = s.encode(enc, 'ignore') if enc else s
    target.write(payload)
    out.flush()
1962
1963
def bytes_to_intlist(bs):
    """Convert a bytes-like or str sequence into a list of integers"""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Elements are characters rather than ints
        return [ord(char) for char in bs]
    return list(bs)
1971
1972
def intlist_to_bytes(xs):
    """Pack a sequence of integers (0-255) into a bytes object"""
    if xs:
        return struct.pack(f'{len(xs)}B', *xs)
    return b''
1977
1978
class LockingUnsupportedError(OSError):
    """OSError carrying a fixed "File locking is not supported" message"""

    msg = 'File locking is not supported'

    def __init__(self):
        OSError.__init__(self, self.msg)
1984
1985
# Cross-platform file locking
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) using
# LockFileEx/UnlockFileEx on Windows, fcntl elsewhere, and stubs raising
# LockingUnsupportedError when fcntl cannot be imported
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure expected by LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Size of the locked byte range (low/high DWORD), used for both lock and unlock
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the file object f; raises BlockingIOError if LockFileEx fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError if UnlockFileEx fails"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock the file object f with flock(), falling back to lockf()"""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # The lock is held by someone else: do not retry with lockf()
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock on f with flock(), falling back to lockf()"""
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            """Locking is unavailable: always raises LockingUnsupportedError"""
            raise LockingUnsupportedError()

        def _unlock_file(f):
            """Locking is unavailable: always raises LockingUnsupportedError"""
            raise LockingUnsupportedError()
2070
2071
class locked_file:
    """File object wrapper that holds a file lock while it is entered

    The file is opened in __init__; the lock is taken in __enter__/open()
    (shared for 'r' modes, exclusive otherwise) and released in
    __exit__/close(), which also closes the file. Unknown attributes are
    forwarded to the wrapped file object.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(flag in mode for flag in 'wax+')
        readable = any(flag in mode for flag in 'r+')

        flags = getattr(os, 'O_CLOEXEC', 0)  # UNIX only
        flags |= getattr(os, 'O_BINARY', 0)  # Windows only
        flags |= getattr(os, 'O_NOINHERIT', 0)  # Windows only
        if writable:
            flags |= os.O_CREAT  # O_TRUNC only after locking
        if 'a' in mode:
            flags |= os.O_APPEND
        if 'x' in mode:
            flags |= os.O_EXCL
        if not writable:
            flags |= os.O_RDONLY
        elif readable:
            flags |= os.O_RDWR
        else:
            flags |= os.O_WRONLY

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        try:
            _lock_file(self.f, 'r' not in self.mode, self.block)
        except OSError:
            # Could not lock: do not leave the file open
            self.f.close()
            raise
        self.locked = True

        if 'w' not in self.mode:
            return self
        # Truncation is deferred until the lock is held
        try:
            self.f.truncate()
        except OSError as err:
            tolerated = (
                errno.ESPIPE,  # Illegal seek - expected for FIFO
                errno.EINVAL,  # Invalid argument - expected for /dev/null
            )
            if err.errno not in tolerated:
                raise
        return self

    def unlock(self):
        """Release the lock if it is held; the file stays open"""
        if self.locked:
            try:
                _unlock_file(self.f)
            finally:
                self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Anything not defined here is looked up on the wrapped file object
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2135
2136
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None (cached)"""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2141
2142
def shell_quote(args):
    """Quote every argument with compat_shlex_quote and join them with spaces

    bytes arguments are decoded with the filesystem encoding first
    """
    encoding = get_filesystem_encoding()
    # We may get a filename encoded with 'encodeFilename'
    return ' '.join(
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
2152
2153
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    The data is JSON-encoded into the URL fragment under the key
    `__youtubedl_smuggle`. If url already carries smuggled data, the two are
    merged, with the data already in the URL taking precedence.
    `data` itself is not modified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a new dict rather than data.update(), which would modify
    # the dict owned by the caller
    merged = {**data, **idata}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data)

    Returns (smug_url, default) when the URL carries no smuggled data
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
2171
2172
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    @param fmt     %-format applied to the tuple (scaled number, suffix)
    @param factor  Ratio between consecutive suffixes; with 1024 the
                   suffixes become Ki, Mi, ...
    Returns None if num is not a number or is negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to 0..len(POSSIBLE_SUFFIXES). The lower bound matters for
        # 0 < num < 1: the logarithm is negative there and would otherwise
        # index the suffix list from its end (e.g. 0.0005 -> "0Y")
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
2185
2186
def format_bytes(bytes):
    """Format a byte count with binary (1024-based) suffixes, or 'N/A' if that yields nothing"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2189
2190
def lookup_unit_table(unit_table, s):
    """Parse "<number><unit>" at the start of s and scale it by unit_table[unit]

    The number may use "," or "." as decimal separator. Returns an int,
    or None if s does not start with a number followed by a known unit.
    """
    units_re = '|'.join(map(re.escape, unit_table))
    mobj = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if mobj is None:
        return None
    multiplier = unit_table[mobj.group('unit')]
    number = float(mobj.group('num').replace(',', '.'))
    return int(number * multiplier)
2200
2201
def parse_filesize(s):
    """Parse a textual file size into an integer via lookup_unit_table

    Returns None if s is None; otherwise whatever lookup_unit_table
    returns for the unit table below.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # NOTE: spellings with a lower-case prefix and upper-case "B" (kB, mB, ...)
    # map to powers of 1024, all other non-"i" spellings to powers of 1000
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
2271
2272
def parse_count(s):
    """Parse a textual count, optionally with a k/m/kk/b multiplier suffix

    Returns None if s is None or no number can be recognised
    """
    if s is None:
        return None

    # Drop a leading non-numeric label followed by whitespace
    text = re.sub(r'^[^\d]+\s', '', s).strip()

    # Plain number, possibly with separators
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, text)
    if scaled is not None:
        return scaled

    # Number followed by unrelated text
    mobj = re.match(r'([\d,.]+)(?:$|\s)', text)
    return str_to_int(mobj.group(1)) if mobj else None
2300
2301
def parse_resolution(s, *, lenient=False):
    """Extract width/height information from a string

    Recognises "<w>x<h>" (also with X, × or a comma), "<h>p"/"<h>i" and
    "4k"/"8k". Returns a dict with 'width' and/or 'height', or {} if nothing
    matches or s is None.

    @param lenient  Allow "<w>x<h>" to be directly surrounded by letters or digits
    """
    if s is None:
        return {}

    dimensions_re = (
        r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
        else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    dimensions = re.search(dimensions_re, s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    scanlines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scanlines:
        return {'height': int(scanlines.group(1))}

    # 4k -> 2160, 8k -> 4320
    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        return {'height': int(kilo.group(1)) * 540}

    return {}
2325
2326
def parse_bitrate(s):
    """Return the integer in front of "kbps" in s, or None if there is none or s is not a str"""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
2333
2334
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name

    Names are looked up in MONTH_NAMES[lang], falling back to the English
    list for unknown languages. Returns None for an unknown name.
    """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    with contextlib.suppress(ValueError):
        return names.index(name) + 1
    return None
2344
2345
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations (the first three letters of the month name);
        None if the abbreviation is unknown """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    with contextlib.suppress(ValueError):
        return abbreviations.index(abbrev) + 1
    return None
2354
2355
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    numeric character reference of up to 4 digits are left alone"""
    bare_ampersand = re.compile(r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
2362
2363
def setproctitle(title):
    """Try to set the process title through libc's prctl(); best effort

    Silently does nothing when libc.so.6 cannot be loaded or has no prctl
    """
    assert isinstance(title, str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initialising the buffer from the bytes (instead of sizing it to
    # len(title_bytes) and assigning .value) makes it one item larger, so the
    # string handed to prctl is NUL-terminated
    buf = ctypes.create_string_buffer(title.encode())
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 = PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
2388
2389
def remove_start(s, start):
    """Return s without the prefix start; s is returned as-is if it is None or lacks the prefix"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2392
2393
def remove_end(s, end):
    """Return s without the suffix end; s is returned as-is if it is None or lacks the suffix"""
    if s is None or not s.endswith(end):
        return s
    # Every string ends with '', but s[:-0] is s[:0], i.e. the empty string,
    # so an empty suffix must not be sliced off
    return s[:-len(end)] if end else s
2396
2397
def remove_quotes(s):
    """Strip one pair of matching single or double quotes enclosing s, if present"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2405
2406
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"

    Returns the network location of url without a leading "www.",
    or None if that is empty
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
2413
2414
def url_basename(url):
    """Return the last segment of the path component of url (slashes at either end ignored)"""
    trimmed = urllib.parse.urlparse(url).path.strip('/')
    return trimmed.rsplit('/', 1)[-1]
2418
2419
def base_url(url):
    """Return the http(s) URL up to and including the last "/" that precedes any "?", "#" or "&" """
    mobj = re.match(r'https?://[^?#&]+/', url)
    return mobj.group()
2422
2423
def urljoin(base, path):
    """Join path onto base, tolerating bytes and invalid input

    Returns path unchanged if it already has a scheme or is
    protocol-relative, and None if path is empty or not a string or if base
    is not an http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not (isinstance(path, str) and path):
        return None
    # Already absolute (or protocol-relative): nothing to join
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode()
    if isinstance(base, str) and re.match(
            r'^(?:https?:)?//', base):
        return urllib.parse.urljoin(base, path)
    return None
2437
2438
class HEADRequest(urllib.request.Request):
    """Request whose get_method() always reports 'HEAD'"""

    def get_method(self):
        return 'HEAD'
2442
2443
class PUTRequest(urllib.request.Request):
    """Request whose get_method() always reports 'PUT'"""

    def get_method(self):
        return 'PUT'
2447
2448
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, scaled as int(v) * invscale // scale

    @param get_attr  If set (and v is not None), convert this attribute of v
                     instead of v itself
    Returns default if the conversion fails
    """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
2456
2457
def str_or_none(v, default=None):
    """Return str(v), or default if v is None"""
    if v is None:
        return default
    return str(v)
2460
2461
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    ints are returned unchanged; strings are stripped of ",", "." and "+"
    before conversion; anything else yields None
    """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    return int_or_none(re.sub(r'[,\.\+]', '', int_str))
2469
2470
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, scaled as float(v) * invscale / scale

    Returns default if v is None or the conversion fails
    """
    if v is None:
        return default
    with contextlib.suppress(ValueError, TypeError):
        return float(v) * invscale / scale
    return default
2478
2479
def bool_or_none(v, default=None):
    """Return v if it is a bool, otherwise default"""
    if isinstance(v, bool):
        return v
    return default
2482
2483
def strip_or_none(v, default=None):
    """Return v.strip() if v is a str, otherwise default"""
    if isinstance(v, str):
        return v.strip()
    return default
2486
2487
def url_or_none(url):
    """Return the stripped url if it is protocol-relative or uses an accepted scheme, else None

    Accepted schemes: http(s), the rtmp and rtsp families, mms and ftp(s)
    """
    if not (url and isinstance(url, str)):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2493
2494
def request_to_url(req):
    """Return the full URL of a urllib Request; anything else is returned unchanged"""
    if not isinstance(req, urllib.request.Request):
        return req
    return req.get_full_url()
2500
2501
def strftime_or_none(timestamp, date_format, default=None):
    """Format a timestamp with strftime, returning default on failure

    @param timestamp    UNIX timestamp (int/float, interpreted as UTC) or a
                        'YYYYMMDD' string
    @param date_format  strftime format of the result
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # For any other type datetime_object is still None here; the
        # resulting AttributeError is turned into `default` below
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        # utcfromtimestamp raises OverflowError/OSError (not ValueError) for
        # timestamps outside the range supported by the platform
        return default
2512
2513
2514 def parse_duration(s):
2515     if not isinstance(s, str):
2516         return None
2517     s = s.strip()
2518     if not s:
2519         return None
2520
2521     days, hours, mins, secs, ms = [None] * 5
2522     m = re.match(r'''(?x)
2523             (?P<before_secs>
2524                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2525             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2526             (?P<ms>[.:][0-9]+)?Z?$
2527         ''', s)
2528     if m:
2529         days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2530     else:
2531         m = re.match(
2532             r'''(?ix)(?:P?
2533                 (?:
2534                     [0-9]+\s*y(?:ears?)?,?\s*
2535                 )?
2536                 (?:
2537                     [0-9]+\s*m(?:onths?)?,?\s*
2538                 )?
2539                 (?:
2540                     [0-9]+\s*w(?:eeks?)?,?\s*
2541                 )?
2542                 (?:
2543                     (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2544                 )?
2545                 T)?
2546                 (?:
2547                     (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2548                 )?
2549                 (?:
2550                     (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2551                 )?
2552                 (?:
2553                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2554                 )?Z?$''', s)
2555         if m:
2556             days, hours, mins, secs, ms = m.groups()
2557         else:
2558             m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2559             if m:
2560                 hours, mins = m.groups()
2561             else:
2562                 return None
2563
2564     if ms:
2565         ms = ms.replace(':', '.')
2566     return sum(float(part or 0) * mult for part, mult in (
2567         (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2568
2569
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the existing extension of `filename`

    If `expected_real_ext` is given and the actual extension differs from
    it, `ext` is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
2576
2577
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of `filename` with `ext`

    If `expected_real_ext` is given and the actual extension differs from
    it, nothing is removed and `ext` is appended to the whole filename.
    """
    name, real_ext = os.path.splitext(filename)
    keep_whole_name = expected_real_ext and real_ext[1:] != expected_real_ext
    base = filename if keep_whole_name else name
    return '{}.{}'.format(base, ext)
2583
2584
def check_executable(exe, args=[]):
    """ Checks if the given binary can be run, and returns its name (False otherwise).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        # The executable could not be launched
        return False
    else:
        return exe
2593
2594
def _get_exe_version_output(exe, args, *, to_screen=None):
    """Run `exe` with `args` and return what it printed (stdout and stderr combined)

    The command line is first reported through `to_screen`, if given.
    False is returned when the executable cannot be launched.
    """
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, _ = Popen.run(
            [encodeArgument(exe)] + args,
            text=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)
    except OSError:
        return False
    else:
        return stdout
2607
2608
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable

    The first group of `version_re` is returned. When no pattern is given,
    whatever follows the word "version" is used.
    `unrecognized` is returned if the pattern is not found in `output`.
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    mobj = re.search(pattern, output)
    return mobj.group(1) if mobj else unrecognized
2618
2619
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present (or printed nothing) """
    out = _get_exe_version_output(exe, args)
    if not out:
        return False
    return detect_exe_version(out, version_re, unrecognized)
2626
2627
def frange(start=0, stop=None, step=1):
    """Float range

    Like range(), but the arguments may be floats (including an infinite
    stop). With a single argument, it is taken as `stop` and counting
    starts at 0. A `step` of 0 yields nothing.
    """
    if stop is None:
        stop, start = start, 0
    if not step:
        direction = 0
    elif step > 0:
        direction = 1
    else:
        direction = -1
    current = start
    # Multiplying by the direction allows one comparison for both signs of step
    while direction * current < direction * stop:
        yield current
        current += step
2636
2637
class LazyList(collections.abc.Sequence):
    """Immutable list-like object that evaluates an iterable lazily

    Items are read from the iterable only when they are first needed and
    are cached afterwards.
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = _cache if _cache is not None else []
        self._reversed = reverse

    def __iter__(self):
        if not self._reversed:
            # Replay what was already read, then continue reading (and caching)
            yield from self._cache
            for item in self._iterable:
                self._cache.append(item)
                yield item
            return
        # We need to consume the entire iterable to iterate in reverse
        yield from self.exhaust()

    def _exhaust(self):
        """Read all remaining items into the cache and return the cache itself"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        items = self._exhaust()
        return items[::-1] if self._reversed else items[::1]

    @staticmethod
    def _reverse_index(x):
        """Map an index counted from one end to the same item counted from the other end"""
        if x is None:
            return None
        return ~x

    def __getitem__(self, idx):
        if not isinstance(idx, (slice, int)):
            raise TypeError('indices must be integers or slices')

        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        else:
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0

        counts_from_end = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if counts_from_end:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            # Read only as many new items as the highest requested position needs
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))

        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Only the first item in iteration order is needed to know this
        probe = -1 if self._reversed else 0
        try:
            self[probe]
        except self.IndexError:
            return False
        else:
            return True

    def __len__(self):
        return len(self._exhaust())

    def __reversed__(self):
        # The new object shares both the iterable and the cache with this one
        return type(self)(
            self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(
            self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2725
2726
class PagedList:
    """Base class for lists whose items are fetched one page at a time

    `pagefunc(pagenum)` must return an iterable with the items of that page.
    Subclasses have to implement `_getslice`.
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the items of page `pagenum` as a list, using the cache if possible"""
        page = self._cache.get(pagenum)
        if page is None:
            if pagenum > self._pagecount:
                # Pages after the last known one are empty; no need to fetch them
                page = []
            else:
                page = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page
        return page

    def getslice(self, start=0, end=None):
        """Return the items from `start` up to (not including) `end` as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not (isinstance(idx, int) and idx >= 0):
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
2765
2766
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            pagesize = self._pagesize
            firstid = pagenum * pagesize
            nextfirstid = firstid + pagesize
            if start >= nextfirstid:
                continue

            # Bounds of the requested range relative to the current page
            startv = start % pagesize if firstid <= start < nextfirstid else 0
            if end is not None and firstid <= end <= nextfirstid:
                endv = (end - 1) % pagesize + 1
            else:
                endv = None

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last page before the failing one, then let the error through
                self._pagecount = pagenum - 1
                raise
            if startv or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2804             if end == nextfirstid:
2805                 break
2806
2807
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        super().__init__(pagefunc, pagesize, use_cache=True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        pagesize = self._pagesize
        # Page holding `start`, and how many of its leading items to skip
        first_page, offset = divmod(start, pagesize)
        if end is None:
            last_page, remaining = self._pagecount, None
        else:
            last_page = min(self._pagecount, end // pagesize + 1)
            remaining = end - start

        for pagenum in range(first_page, last_page):
            page = self.getpage(pagenum)
            if offset:
                page = page[offset:]
                offset = None  # Only the first page needs to be trimmed
            if remaining is not None:
                if len(page) >= remaining:
                    yield from page[:remaining]
                    break
                remaining -= len(page)
            yield from page
2832
2833
class PlaylistEntries:
    """Access to the `entries` of a playlist info dict using 1-based indices

    Indexing (`obj[idx]`, with an int or a slice) gives a generator of
    `(index, entry)` tuples, where `index` is the 1-based position.
    """

    # Placeholder stored at the positions of an incomplete playlist that
    # have no corresponding entry (see __init__)
    MissingEntry = object()
    # Set when the entries are given as a list, and by __getitem__ once an
    # index beyond the last entry has been requested
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        Object providing `params`, `report_warning` and `_match_entry`
                          (presumably the YoutubeDL instance - confirm against callers)
        @param info_dict  Playlist info dict. Its 'entries' and, if present,
                          'requested_entries' are read

        Raises EntryNotInPlaylist if the info dict has no 'entries'
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = bool(requested_entries)
        if self.is_incomplete:
            assert self.is_exhausted
            # Put each entry at its 1-based position from requested_entries;
            # every other position up to the largest one is left as MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries)
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            # Any other iterable is wrapped so that it can be indexed
            self._entries = LazyList(entries)

    # A single segment of a playlist items specification:
    # "[START]" optionally followed by ":" or "-", "[END]" and "[:STEP]"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Parse a comma separated playlist items specification

        Yields a slice for every segment that is a range and an int for every
        segment that is a single index.
        Raises ValueError for an empty or invalid segment and for a step of zero.
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # The end is converted with float_or_none since it may be "inf"
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield `(index, entry)` for the entries selected through the params of `ydl`

        The 'playlist_items' param is used if set. Otherwise, the range is
        built from 'playliststart' and 'playlistend'.
        The generator stops when `_match_entry` raises ExistingVideoReached
        or RejectedVideoReached for an entry.
        """
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # TODO: Add auto-generated fields
                    self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries, or None when it is not known

        The count is known when the playlist is exhausted and complete, or
        when the entries are an InAdvancePagedList with one entry per page.
        """
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function that returns the entry at the given 0-based index

        The function raises self.IndexError when the index is beyond the
        available entries. For list entries, it raises EntryNotInPlaylist
        when the entry at that position is missing.
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # The lookup is run through the _handle_extraction_exceptions wrapper of ydl
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Generate `(index, entry)` for an int or slice of 1-based indices

        Both ends of a slice are inclusive. Negative values count from the
        end, which requires computing the length of the playlist.
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move the stop one step further since frange excludes it
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forward, nothing can follow. Going backward, lower indices may still exist
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts the entries by generating all of them
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2966
2967
def uppercase_escape(s):
    r"""Replace every 8-digit \UXXXXXXXX escape in `s` with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        text, _ = decode(mobj.group(0))
        return text

    return re.sub(r'\\U[0-9a-fA-F]{8}', unescape, s)
2974
2975
def lowercase_escape(s):
    r"""Replace every 4-digit \uXXXX escape in `s` with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        text, _ = decode(mobj.group(0))
        return text

    return re.sub(r'\\u[0-9a-fA-F]{4}', unescape, s)
2982
2983
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that are left alone, which includes "%" itself
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe_chars)
2987
2988
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)
    # The host name is IDNA encoded, whereas the other components are percent-escaped
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
2999
3000
def parse_qs(url):
    """Return the query string of `url` parsed into a dict of lists"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query)
3003
3004
def read_batch_urls(batch_fd):
    """Read the URLs in a batch file and close it

    `batch_fd` is an iterable of lines (str or bytes) having a close()
    method. Blank lines and lines starting with "#", ";" or "]" are
    skipped. A "#" preceded by whitespace begins a comment, which is
    removed along with everything after it.
    """
    BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')

    def fixup(line):
        url = line if isinstance(line, str) else line.decode('utf-8', 'replace')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        head = re.split(r'\s#', url, maxsplit=1)[0]
        return head.rstrip()

    urls = []
    with contextlib.closing(batch_fd) as fd:
        for line in fd:
            url = fixup(line)
            if url:
                urls.append(url)
    return urls
3022
3023
def urlencode_postdata(*args, **kargs):
    """Same as urllib.parse.urlencode, but returns ASCII bytes (eg: for use as POST data)"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
3026
3027
def update_url_query(url, query):
    """Return `url` with the parameters in `query` added to its query string

    Parameters that are already in the URL are replaced by those in `query`.
    The URL is returned unchanged when `query` is empty.
    """
    if not query:
        return url
    parts = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(parts.query)
    params.update(query)
    new_query = urllib.parse.urlencode(params, doseq=True)
    return urllib.parse.urlunparse(parts._replace(query=new_query))
3036
3037
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Create a copy of the request `req` with some of its parts changed

    `headers` are added to the existing headers and `query` is merged into
    the query string. `url` and `data` replace the original values when
    they are truthy. Only the HEAD and PUT methods are carried over;
    anything else results in a plain urllib Request.
    """
    new_headers = req.headers.copy()
    new_headers.update(headers or {})
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)

    method = req.get_method()
    if method == 'HEAD':
        request_class = HEADRequest
    elif method == 'PUT':
        request_class = PUTRequest
    else:
        request_class = urllib.request.Request

    new_req = request_class(
        new_url,
        data=new_data,
        headers=new_headers,
        origin_req_host=req.origin_req_host,
        unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3056
3057
def _multipart_encode_impl(data, boundary):
    """Encode `data` as multipart/form-data using the given boundary

    Returns the tuple (body as bytes, value for the Content-Type header).
    Raises ValueError if the boundary occurs in any of the fields.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    delimiter = boundary.encode('ascii')

    chunks = []
    for name, value in data.items():
        if isinstance(name, str):
            name = name.encode()
        if isinstance(value, str):
            value = value.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if delimiter in content:
            raise ValueError('Boundary overlaps with data')
        chunks += (b'--' + delimiter + b'\r\n', content)

    chunks.append(b'--' + delimiter + b'--\r\n')

    return b''.join(chunks), content_type


def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns the tuple (encoded data as bytes, value for the Content-Type
    header). ValueError is raised if a specified boundary occurs in the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        return _multipart_encode_impl(data, boundary)

    # Keep generating boundaries until one is found that is not part of the data
    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
3108
3109
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the value of the first of the given keys that has a usable value in `d`

    None is never usable. Other falsy values are usable only if
    `skip_false_values` is False. `default` is returned if there is none.
    """
    for key in variadic(key_or_keys):
        val = d.get(key)
        if val is None:
            continue
        if val or not skip_false_values:
            return val
    return default
3115
3116
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call the given functions in order and return the first acceptable result

    Each function is called with `args` and `kwargs`. A result is skipped
    if it is not an instance of `expected_type` (when given). A function is
    also skipped if it raises AttributeError, KeyError, TypeError,
    IndexError or ZeroDivisionError. None is returned if nothing is accepted.
    """
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
3126
3127
def try_get(src, getter, expected_type=None):
    """Apply the getter(s) to `src` and return the first acceptable result

    `getter` is a function, or several of them. See try_call for what is
    considered acceptable.
    """
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
3130
3131
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of `dct` for which `cndn(key, value)` is true

    By default, the items whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
3134
3135
def merge_dicts(*dicts):
    """Merge the dicts, with the earlier ones taking precedence

    For each key, the first value that is not None is used. The only
    exception is that an empty string gets replaced by a later str value.
    """
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if k not in merged:
                merged[k] = v
            elif isinstance(v, str) and merged[k] == '':
                merged[k] = v
    return merged
3144
3145
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as a str, decoding it with `encoding` if it is not one already"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
3148
3149
# Age limit for each US rating label. parse_age_limit() looks these up
# after converting its input to upper case
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limit for each "TV-*" rating label. parse_age_limit() also accepts
# these with "_" or nothing in place of the "-". eg: TV_MA, TVMA
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3167
3168
def parse_age_limit(s):
    """Convert an age rating to an age limit (int), or None if it is not understood

    Accepted are ints from 0 to 21, strings like "18" or "18+", the labels
    in US_RATINGS and the labels in TV_PARENTAL_GUIDELINES.
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    # The keys all start with "TV-". Only the part after it is matched
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
3185
3186
def strip_jsonp(code):
    """Remove the JSONP wrapper from `code` and return the data that was passed to the callback

    The forms "cb(...)", "window.cb(...)" and "cb && cb(...)" are handled.
    A ";" and "//" comments after the call are removed as well.
    """
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
3195
3196
def js_to_json(code, vars={}):
    """Convert JavaScript data (eg: an object or array literal) to JSON

    Among others, single-quoted strings and unquoted keys become
    double-quoted, comments and trailing commas are removed, hex and octal
    integers are written in decimal, and undefined/void 0 become null.

    vars is a dict of var, val pairs to substitute
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )
    # What to replace a quote/escape sequence in a JS string with.
    # Escape sequences that are not listed here are kept as they are
    STRING_FIXES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_string_part(mobj):
        part = mobj.group(0)
        return STRING_FIXES.get(part, part)

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v in ('undefined', 'void 0'):
            return 'null'
        if v == ',' or v.startswith(('/*', '//', '!')):
            # Comments, trailing commas and negations are dropped
            return ""

        if v[0] in ("'", '"'):
            v = re.sub(r'(?s)\\.|"', fix_string_part, v[1:-1])
            return '"%s"' % v

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Integers used as keys must be quoted
                return '"%d":' % i if v.endswith(':') else '%d' % i

        if v in vars:
            return vars[v]

        return '"%s"' % v

    def create_map(mobj):
        # The argument of Map() is an array of [key, value] pairs
        pairs = json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))
        return json.dumps(dict(pairs))

    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)

    token_re = r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE)
    return re.sub(token_re, fix_kv, code)
3249
3250
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function that maps a quality id to its position in
    `quality_ids`, or to -1 if it is not one of them """
    def q(qid):
        with contextlib.suppress(ValueError):
            return quality_ids.index(qid)
        return -1
    return q
3259
3260
# NOTE(review): presumably the stages at which a postprocessor can be run,
# in the order they occur - confirm against the users of this tuple
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Output templates, keyed by the type of output they are for
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# The known types of output template. Each is mapped to either None or a string
# NOTE(review): the strings look like file extensions used for that type - confirm against usage
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# This is a template that has to be completed using str.format():
# {0} is the pattern for the mapping key and {1} the pattern for the conversion type
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of printf-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3300
3301
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than `length` are cut so that the result, including
    the ellipses, is `length` characters long. None is passed through """
    if s is None or len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
3310
3311
def version_tuple(v):
    """Split a version string at "." and "-" and return its parts as a tuple of ints

    Raises ValueError if any of the parts is not an integer.
    """
    return tuple(map(int, re.split(r'[-.]', v)))
3314
3315
def is_outdated_version(version, limit, assume_new=True):
    """Return whether `version` is older than `limit`

    If `version` is empty or either version cannot be parsed, the result
    depends on `assume_new`: when it is true, the version is taken to be
    new enough. ie: False is returned.
    """
    if version:
        with contextlib.suppress(ValueError):
            return version_tuple(version) < version_tuple(limit)
    return not assume_new
3323
3324
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at the top of the module
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
3331
3332
def args_to_str(args):
    """Get a short string representation for a subprocess command

    Each of the arguments is quoted and they are joined with spaces.
    """
    return ' '.join(map(compat_shlex_quote, args))
3336
3337
def error_to_compat_str(err):
    """Return the message of the error `err`. ie: str(err)"""
    message = str(err)
    return message
3340
3341
def error_to_str(err):
    """Return the error `err` formatted as "<name of its class>: <message>" """
    kind = type(err).__name__
    return f'{kind}: {err}'
3344
3345
def mimetype2ext(mt):
    """Guess the file extension (without the dot) for a MIME type

    Parameters of the MIME type (eg: "; charset=utf-8") are ignored and
    the matching is case-insensitive. The full type is looked up first,
    then its subtype and then the structured syntax suffix of the subtype
    (eg: "json" in "vnd.foo+json"). If nothing is known about it, the
    subtype itself is returned, with any "+" replaced by ".".

    Returns None if `mt` is not a str.
    """
    if not isinstance(mt, str):
        return None

    # Type and subtype names are case-insensitive (RFC 2045, section 5.1),
    # so normalize once instead of only for some of the lookups
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    _, _, subtype = mt.rpartition('/')
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    _, _, suffix = subtype.partition('+')
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
3408
3409
def ext2mimetype(ext_or_url):
    """
    Guess the MIME type for a bare extension, a filename or a URL

    @returns  The guessed MIME type, or None for empty input or unknown types
    """
    if not ext_or_url:
        return None
    # A value without any dot is treated as a bare extension
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _ = mimetypes.guess_type(filename)
    return mime_type
3416
3417
def parse_codecs(codecs_str):
    """
    Split a comma-separated codecs string into video/audio/subtitle codecs

    See http://tools.ietf.org/html/rfc6381

    @returns  {} for empty input. Otherwise a dict with "vcodec", "acodec"
              ("none" when absent), "dynamic_range" and, if present, "scodec".
              If nothing is recognised but exactly two codecs were given, they
              are taken as video and audio codec in that order
    """
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zero padding is dropped before the codec family is inspected
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                continue  # Only the first video codec counts
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            if not acodec:
                acodec = full_codec
        elif family in SUBTITLE_CODECS:
            if not scodec:
                scodec = full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if not (vcodec or acodec or scodec):
        if len(split_codecs) == 2:
            first, second = split_codecs
            return {
                'vcodec': first,
                'acodec': second,
            }
        return {}

    result = {
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
        'dynamic_range': hdr,
    }
    if scodec is not None:
        result['scodec'] = scodec
    return result
3458
3459
def urlhandle_detect_ext(url_handle):
    """
    Guess the file extension from the headers of a response

    The filename of a "Content-Disposition: attachment" header is preferred;
    otherwise the extension is derived from the Content-Type header
    """
    header = url_handle.headers.get

    disposition = header('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    content_type = header('Content-Type')
    return mimetype2ext(content_type)
3472
3473
def encode_data_uri(data, mime_type):
    """ Build a base64 "data:" URI carrying `data` (bytes) with the given MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return ''.join(('data:', str(mime_type), ';base64,', payload))
3476
3477
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked if no limit is set (age_limit is None)
    # or if the content is available for everyone (content_limit is None)
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3486
3487
# List of known byte-order-marks (BOM)
# Each entry is a (BOM bytes, codec name) pair.
# The UTF-32-LE mark is listed before the UTF-16-LE one since the latter
# (b'\xff\xfe') is a prefix of the former; code that tests the entries in list
# order therefore sees the longer mark first
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3496
3497
def is_html(first_bytes):
    """
    Detect whether a file contains HTML by examining its first bytes.

    Leading byte-order-marks are stripped and select the encoding used for
    decoding (UTF-8 if there is none). The result is the match object for an
    optional run of whitespace followed by "<", or None if there is no match
    """

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        bom_length = len(bom)
        while first_bytes.startswith(bom):
            first_bytes = first_bytes[bom_length:]
            encoding = bom_encoding

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
3507
3508
def determine_protocol(info_dict):
    """
    Determine the download protocol of a format

    An explicit "protocol" entry always wins. Otherwise the protocol is derived
    from the sanitized URL: its rtmp/mms/rtsp prefix, then its m3u8/f4m
    extension, and finally its URL scheme
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return urllib.parse.urlparse(url).scheme
3529
3530
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """
    Render a header and a list of rows, each a list of values, as aligned text.
    Within a cell, text after a tab character will be right aligned.

    @param delim       If set, a line made of this string is inserted below the header
    @param extra_gap   Additional spaces between neighbouring columns
    @param hide_empty  Drop columns in which every data cell is empty
    """
    def visible_width(text):
        # Terminal sequences and the alignment tab take up no space on screen
        return len(remove_terminal_sequences(text).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_width(str(cell)) for cell in column) for column in zip(*rows)]

    def select_columns(row, mask):
        # Positions beyond the end of the shorter sequence are filled with True
        return [cell for keep, cell in itertools.zip_longest(mask, row, fillvalue=True) if keep]

    # A column whose widest data cell has width 0 yields a falsy mask entry
    column_mask = column_widths(data) if hide_empty else []
    header_row = select_columns(header_row, column_mask)
    data = [select_columns(row, column_mask) for row in data]

    widths = column_widths([header_row, *data])
    extra_gap += 1

    table = [header_row]
    if delim:
        separator = [delim * (col_width + extra_gap) for col_width in widths]
        separator[-1] = separator[-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
        table.append(separator)
    table.extend(data)

    def pad(text, col_width):
        slack = col_width - visible_width(text)
        if '\t' in text:
            return text.replace('\t', ' ' * slack) + ' ' * extra_gap
        return text + ' ' * (slack + extra_gap)

    lines = []
    for row in table:
        cells = [pad(text, widths[pos]) for pos, text in enumerate(map(str, row))]
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3561
3562
def _match_one(filter_part, dct, incomplete):
    """
    Evaluate a single filter expression against a dictionary.

    Two forms are understood: a comparison "<key> [!]<op>[?] <value>" (the value
    may be quoted) and a unary test "[!]<key>".

    @param filter_part  The expression to evaluate
    @param dct          The dictionary to look the key up in
    @param incomplete   Either a bool, or a container of the keys that may be missing.
                        A condition on such a key passes if the key's value is None
    @returns            A truthy/falsy value telling whether the condition holds
    @raises ValueError  If the expression cannot be parsed, or if a string operator
                        is used with a value that got interpreted as a number
    """
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    # The keys are joined into a regex alternation below, in insertion order
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    # Normalize "incomplete" into a predicate on the key
    if isinstance(incomplete, bool):
        is_incomplete = lambda _: incomplete
    else:
        is_incomplete = lambda k: k in incomplete

    # Comparison form: key, optional "!" negation, operator, optional "?"
    # (accept a missing value) and a quoted or bare comparison value
    operator_rex = re.compile(r'''(?x)
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        # NOTE(review): the pattern above defines no "intval" group. The lookup looks
        # unreachable since one of the two value groups always captures at least one
        # character; confirm and drop it
        comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
        if m['quote']:
            # Unescape quote characters inside a quoted value
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, (int, float)):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            # Hence the value is only parsed as a number if the field is numeric:
            # first as an int, then as a filesize (with and without a "B" appended)
            # and finally as a duration
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            # Missing value: passes for incomplete keys or if "?" was given
            return is_incomplete(m['key']) or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    # Unary form: "key" tests for presence (or True for bools),
    # "!key" for absence (or False for bools)
    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if is_incomplete(m.group('key')) and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
3641
3642
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    The filter is a list of conditions separated by "&" (a literal "&" is
    written as "\\&"); evaluation stops at the first condition that fails.
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    for condition in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(condition.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3653
3654
def match_filter_func(filters):
    """
    Build a match-filter callback from one or more filter strings

    @param filters  A filter string or a collection of them. The special entry "-"
                    turns on interactive mode and is not treated as a filter
    @returns        None if no filters are given. Otherwise a function
                    (info_dict, incomplete=False) that returns None (or NO_DEFAULT
                    in interactive mode with complete info) if any filter passes,
                    and a message explaining the skip if none does
    """
    if not filters:
        return None
    filters = set(variadic(filters))

    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        passed = not filters or any(match_str(f, info_dict, incomplete) for f in filters)
        if not passed:
            video_title = info_dict.get('title') or info_dict.get('id') or 'video'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
        if interactive and not incomplete:
            return NO_DEFAULT
        return None
    return _match_func
3672
3673
class download_range_func:
    """
    Callable yielding the sections of a video selected by chapter title regexes
    and/or explicit (start, end) time ranges
    """

    def __init__(self, chapters, ranges):
        self.chapters = chapters
        self.ranges = ranges

    def __call__(self, info_dict, ydl):
        """
        Yield every chapter whose title matches one of the regexes (with its
        "index" added), followed by one start_time/end_time dict per range.
        A note is printed via ydl if regexes were given but nothing matched
        """
        if info_dict.get('chapters'):
            no_match_reason = 'There are no chapters matching the regex'
        else:
            no_match_reason = 'Cannot match chapters since chapter information is unavailable'

        matched_any = False
        for pattern in self.chapters or []:
            for idx, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(pattern, chapter['title']):
                    continue
                matched_any = True
                yield {**chapter, 'index': idx}
        if self.chapters and not matched_any:
            ydl.to_screen(f'[info] {info_dict["id"]}: {no_match_reason}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges
3694
3695
def parse_dfxp_time_expr(time_expr):
    """
    Parse a DFXP/TTML time expression into seconds

    Accepted are a plain number with an optional "s" suffix, and a clock value
    "H:MM:SS" whose fraction may follow either "." or ":".
    @returns  The time in seconds, or None if the expression is empty or not understood
    """
    if not time_expr:
        return None

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if not clock_match:
        return None
    hours, minutes, seconds = clock_match.groups()
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3707
3708
def srt_subtitles_timecode(seconds):
    """ Format a time in seconds as an SRT timecode (HH:MM:SS,mmm) """
    parts = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % parts
3711
3712
def ass_subtitles_timecode(seconds):
    """ Format a time in seconds as an ASS timecode (H:MM:SS.cc) """
    parts = timetuple_from_msec(seconds * 1000)
    # The last field is given in hundredths of a second instead of milliseconds
    centiseconds = parts.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*parts[:-1], centiseconds)
3716
3717
def dfxp2srt(dfxp_data):
    '''
    Convert a DFXP/TTML subtitle document into SRT.

    Styling from <style> definitions, from the style of <body>/<div> and from
    inline tts:* attributes is rendered as <b>, <i>, <u> and <font> tags.
    Paragraphs without a begin time, or with neither an end time nor a
    duration, are left out.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document does not contain any <p> element
    '''
    # (current namespace, [legacy namespaces]); the legacy ones are rewritten
    # to the current one in the raw data before parsing
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # tts:* attributes that are taken into account
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> {property: value}, with inherited properties merged in
    default_style = {}  # style of <body>/<div>, applied to every element

    class TTMLPElementParser:
        """ XMLParser target that renders the content of one <p> element """
        _out = ''
        # NOTE: these two lists are class attributes and hence shared by all
        # parsers created during one dfxp2srt() call. end() pops _applied_styles
        # only for elements that opened a tag, so style state can carry over to
        # later paragraphs. Kept as-is since changing it alters the output
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                # Effective style, by increasing precedence:
                # default style, referenced style, inline tts:* attributes
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties that are already in effect
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened by the matching start(), innermost first
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialize the node and parse it again so that it is fed through the
        # TTMLPElementParser callbacks
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Collect the <style> definitions. A style may inherit from a parent that
    # only becomes known later, so passes are repeated until no style is left
    # waiting for its parent.
    style_elements = dfxp.findall(_x('.//ttml:style'))
    previous_unresolved = None
    ignore_missing_parents = False
    while True:
        unresolved = 0
        for style in style_elements:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id in styles:
                    styles[style_id] = styles[parent_style_id].copy()
                elif not ignore_missing_parents:
                    unresolved += 1
                    continue
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not unresolved:
            break
        # If a pass resolves nothing new, the remaining parents can never show
        # up (unknown id, parent without any supported property, or a reference
        # cycle). Instead of looping forever, run one last pass in which such
        # styles fall back to their own properties only
        ignore_missing_parents = unresolved == previous_unresolved
        previous_unresolved = unresolved

    # The styles of <body> and <div> act as the default for every element
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # The index is taken from the position among all paragraphs, so skipped
    # paragraphs leave gaps in the numbering
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3880
3881
def cli_option(params, command_option, param, separator=None):
    """
    Build the command line arguments for an option that takes a value

    @param params          Mapping to read the value of `param` from
    @param command_option  The option to emit, e.g. "--proxy"
    @param separator       If given, option and value are joined by it into a
                           single argument instead of being passed as two
    @returns               A list of arguments; empty if the value is None
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3887
3888
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """
    Build the command line arguments for a boolean option

    The value of `param` must be True, False or None; it is rendered as
    true_value/false_value, and None results in no arguments at all
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered = {True: true_value, False: false_value}
    return cli_option(rendered, command_option, flag, separator)
3893
3894
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] if the value of `param` equals expected_value, else [] """
    if params.get(param) == expected_value:
        return [command_option]
    return []
3897
3898
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """
    Look up extra command line arguments in a dict of argument lists

    @param argdict     Dict mapping lower-case keys to lists of arguments. A plain
                       list/tuple is accepted for backward compatibility
    @param keys        List/tuple of keys to try in order. An entry may itself be a
                       group of keys, whose argument lists are concatenated
    @param default     Returned if there is no argdict or none of the keys is present
    @param use_compat  Whether a list/tuple argdict is returned as-is instead
                       of being ignored
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        return argdict if use_compat else default
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_group in keys:
        candidates = [argdict.get(key.lower()) for key in variadic(key_group)]
        present = [args for args in candidates if args is not None]
        if present:
            return list(itertools.chain.from_iterable(present))
    return default
3917
3918
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """
    Build the lookup keys for cli_configuration_args() and delegate to it

    The root key is `exe` itself if it equals `main_key`, else "main_key+exe"
    (both lower-cased), and each entry of `keys` is appended to it. If the bare
    root key is among the resulting keys, the (main_key, exe) group and
    "default" are added as fallbacks; otherwise use_compat is switched off
    """
    main_key = main_key.lower()
    exe = exe.lower()
    same_name = main_key == exe
    root_key = exe if same_name else f'{main_key}+{exe}'
    keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in keys:
        use_compat = False
    else:
        if not same_name:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
3930
3931
class ISO639Utils:
    """ Conversion between two-letter and three-letter language codes """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so "en-US" maps like "en"
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # The first matching entry wins, i.e. the current code is preferred
        # over a replaced one that is listed after it
        matches = (short for short, long_code in cls._lang_map.items() if long_code == code)
        return next(matches, None)
4135
4136
4137 class ISO3166Utils:
4138     # From http://data.okfn.org/data/core/country-list
4139     _country_map = {
4140         'AF': 'Afghanistan',
4141         'AX': 'Åland Islands',
4142         'AL': 'Albania',
4143         'DZ': 'Algeria',
4144         'AS': 'American Samoa',
4145         'AD': 'Andorra',
4146         'AO': 'Angola',
4147         'AI': 'Anguilla',
4148         'AQ': 'Antarctica',
4149         'AG': 'Antigua and Barbuda',
4150         'AR': 'Argentina',
4151         'AM': 'Armenia',
4152         'AW': 'Aruba',
4153         'AU': 'Australia',
4154         'AT': 'Austria',
4155         'AZ': 'Azerbaijan',
4156         'BS': 'Bahamas',
4157         'BH': 'Bahrain',
4158         'BD': 'Bangladesh',
4159         'BB': 'Barbados',
4160         'BY': 'Belarus',
4161         'BE': 'Belgium',
4162         'BZ': 'Belize',
4163         'BJ': 'Benin',
4164         'BM': 'Bermuda',
4165         'BT': 'Bhutan',
4166         'BO': 'Bolivia, Plurinational State of',
4167         'BQ': 'Bonaire, Sint Eustatius and Saba',
4168         'BA': 'Bosnia and Herzegovina',
4169         'BW': 'Botswana',
4170         'BV': 'Bouvet Island',
4171         'BR': 'Brazil',
4172         'IO': 'British Indian Ocean Territory',
4173         'BN': 'Brunei Darussalam',
4174         'BG': 'Bulgaria',
4175         'BF': 'Burkina Faso',
4176         'BI': 'Burundi',
4177         'KH': 'Cambodia',
4178         'CM': 'Cameroon',
4179         'CA': 'Canada',
4180         'CV': 'Cape Verde',
4181         'KY': 'Cayman Islands',
4182         'CF': 'Central African Republic',
4183         'TD': 'Chad',
4184         'CL': 'Chile',
4185         'CN': 'China',
4186         'CX': 'Christmas Island',
4187         'CC': 'Cocos (Keeling) Islands',
4188         'CO': 'Colombia',
4189         'KM': 'Comoros',
4190         'CG': 'Congo',
4191         'CD': 'Congo, the Democratic Republic of the',
4192         'CK': 'Cook Islands',
4193         'CR': 'Costa Rica',
4194         'CI': 'Côte d\'Ivoire',
4195         'HR': 'Croatia',
4196         'CU': 'Cuba',
4197         'CW': 'Curaçao',
4198         'CY': 'Cyprus',
4199         'CZ': 'Czech Republic',
4200         'DK': 'Denmark',
4201         'DJ': 'Djibouti',
4202         'DM': 'Dominica',
4203         'DO': 'Dominican Republic',
4204         'EC': 'Ecuador',
4205         'EG': 'Egypt',
4206         'SV': 'El Salvador',
4207         'GQ': 'Equatorial Guinea',
4208         'ER': 'Eritrea',
4209         'EE': 'Estonia',
4210         'ET': 'Ethiopia',
4211         'FK': 'Falkland Islands (Malvinas)',
4212         'FO': 'Faroe Islands',
4213         'FJ': 'Fiji',
4214         'FI': 'Finland',
4215         'FR': 'France',
4216         'GF': 'French Guiana',
4217         'PF': 'French Polynesia',
4218         'TF': 'French Southern Territories',
4219         'GA': 'Gabon',
4220         'GM': 'Gambia',
4221         'GE': 'Georgia',
4222         'DE': 'Germany',
4223         'GH': 'Ghana',
4224         'GI': 'Gibraltar',
4225         'GR': 'Greece',
4226         'GL': 'Greenland',
4227         'GD': 'Grenada',
4228         'GP': 'Guadeloupe',
4229         'GU': 'Guam',
4230         'GT': 'Guatemala',
4231         'GG': 'Guernsey',
4232         'GN': 'Guinea',
4233         'GW': 'Guinea-Bissau',
4234         'GY': 'Guyana',
4235         'HT': 'Haiti',
4236         'HM': 'Heard Island and McDonald Islands',
4237         'VA': 'Holy See (Vatican City State)',
4238         'HN': 'Honduras',
4239         'HK': 'Hong Kong',
4240         'HU': 'Hungary',
4241         'IS': 'Iceland',
4242         'IN': 'India',
4243         'ID': 'Indonesia',
4244         'IR': 'Iran, Islamic Republic of',
4245         'IQ': 'Iraq',
4246         'IE': 'Ireland',
4247         'IM': 'Isle of Man',
4248         'IL': 'Israel',
4249         'IT': 'Italy',
4250         'JM': 'Jamaica',
4251         'JP': 'Japan',
4252         'JE': 'Jersey',
4253         'JO': 'Jordan',
4254         'KZ': 'Kazakhstan',
4255         'KE': 'Kenya',
4256         'KI': 'Kiribati',
4257         'KP': 'Korea, Democratic People\'s Republic of',
4258         'KR': 'Korea, Republic of',
4259         'KW': 'Kuwait',
4260         'KG': 'Kyrgyzstan',
4261         'LA': 'Lao People\'s Democratic Republic',
4262         'LV': 'Latvia',
4263         'LB': 'Lebanon',
4264         'LS': 'Lesotho',
4265         'LR': 'Liberia',
4266         'LY': 'Libya',
4267         'LI': 'Liechtenstein',
4268         'LT': 'Lithuania',
4269         'LU': 'Luxembourg',
4270         'MO': 'Macao',
4271         'MK': 'Macedonia, the Former Yugoslav Republic of',
4272         'MG': 'Madagascar',
4273         'MW': 'Malawi',
4274         'MY': 'Malaysia',
4275         'MV': 'Maldives',
4276         'ML': 'Mali',
4277         'MT': 'Malta',
4278         'MH': 'Marshall Islands',
4279         'MQ': 'Martinique',
4280         'MR': 'Mauritania',
4281         'MU': 'Mauritius',
4282         'YT': 'Mayotte',
4283         'MX': 'Mexico',
4284         'FM': 'Micronesia, Federated States of',
4285         'MD': 'Moldova, Republic of',
4286         'MC': 'Monaco',
4287         'MN': 'Mongolia',
4288         'ME': 'Montenegro',
4289         'MS': 'Montserrat',
4290         'MA': 'Morocco',
4291         'MZ': 'Mozambique',
4292         'MM': 'Myanmar',
4293         'NA': 'Namibia',
4294         'NR': 'Nauru',
4295         'NP': 'Nepal',
4296         'NL': 'Netherlands',
4297         'NC': 'New Caledonia',
4298         'NZ': 'New Zealand',
4299         'NI': 'Nicaragua',
4300         'NE': 'Niger',
4301         'NG': 'Nigeria',
4302         'NU': 'Niue',
4303         'NF': 'Norfolk Island',
4304         'MP': 'Northern Mariana Islands',
4305         'NO': 'Norway',
4306         'OM': 'Oman',
4307         'PK': 'Pakistan',
4308         'PW': 'Palau',
4309         'PS': 'Palestine, State of',
4310         'PA': 'Panama',
4311         'PG': 'Papua New Guinea',
4312         'PY': 'Paraguay',
4313         'PE': 'Peru',
4314         'PH': 'Philippines',
4315         'PN': 'Pitcairn',
4316         'PL': 'Poland',
4317         'PT': 'Portugal',
4318         'PR': 'Puerto Rico',
4319         'QA': 'Qatar',
4320         'RE': 'Réunion',
4321         'RO': 'Romania',
4322         'RU': 'Russian Federation',
4323         'RW': 'Rwanda',
4324         'BL': 'Saint Barthélemy',
4325         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4326         'KN': 'Saint Kitts and Nevis',
4327         'LC': 'Saint Lucia',
4328         'MF': 'Saint Martin (French part)',
4329         'PM': 'Saint Pierre and Miquelon',
4330         'VC': 'Saint Vincent and the Grenadines',
4331         'WS': 'Samoa',
4332         'SM': 'San Marino',
4333         'ST': 'Sao Tome and Principe',
4334         'SA': 'Saudi Arabia',
4335         'SN': 'Senegal',
4336         'RS': 'Serbia',
4337         'SC': 'Seychelles',
4338         'SL': 'Sierra Leone',
4339         'SG': 'Singapore',
4340         'SX': 'Sint Maarten (Dutch part)',
4341         'SK': 'Slovakia',
4342         'SI': 'Slovenia',
4343         'SB': 'Solomon Islands',
4344         'SO': 'Somalia',
4345         'ZA': 'South Africa',
4346         'GS': 'South Georgia and the South Sandwich Islands',
4347         'SS': 'South Sudan',
4348         'ES': 'Spain',
4349         'LK': 'Sri Lanka',
4350         'SD': 'Sudan',
4351         'SR': 'Suriname',
4352         'SJ': 'Svalbard and Jan Mayen',
4353         'SZ': 'Swaziland',
4354         'SE': 'Sweden',
4355         'CH': 'Switzerland',
4356         'SY': 'Syrian Arab Republic',
4357         'TW': 'Taiwan, Province of China',
4358         'TJ': 'Tajikistan',
4359         'TZ': 'Tanzania, United Republic of',
4360         'TH': 'Thailand',
4361         'TL': 'Timor-Leste',
4362         'TG': 'Togo',
4363         'TK': 'Tokelau',
4364         'TO': 'Tonga',
4365         'TT': 'Trinidad and Tobago',
4366         'TN': 'Tunisia',
4367         'TR': 'Turkey',
4368         'TM': 'Turkmenistan',
4369         'TC': 'Turks and Caicos Islands',
4370         'TV': 'Tuvalu',
4371         'UG': 'Uganda',
4372         'UA': 'Ukraine',
4373         'AE': 'United Arab Emirates',
4374         'GB': 'United Kingdom',
4375         'US': 'United States',
4376         'UM': 'United States Minor Outlying Islands',
4377         'UY': 'Uruguay',
4378         'UZ': 'Uzbekistan',
4379         'VU': 'Vanuatu',
4380         'VE': 'Venezuela, Bolivarian Republic of',
4381         'VN': 'Viet Nam',
4382         'VG': 'Virgin Islands, British',
4383         'VI': 'Virgin Islands, U.S.',
4384         'WF': 'Wallis and Futuna',
4385         'EH': 'Western Sahara',
4386         'YE': 'Yemen',
4387         'ZM': 'Zambia',
4388         'ZW': 'Zimbabwe',
4389         # Not ISO 3166 codes, but used for IP blocks
4390         'AP': 'Asia/Pacific Region',
4391         'EU': 'Europe',
4392     }
4393
4394     @classmethod
4395     def short2full(cls, code):
4396         """Convert an ISO 3166-2 country code to the corresponding full name"""
4397         return cls._country_map.get(code.upper())
4398
4399
class GeoUtils:
    """Helpers for producing plausible IPv4 addresses for a given country."""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random dotted-quad IPv4 address from a country or CIDR block.

        @param code_or_block    Either a two-character country code (looked up
                                case-insensitively in `_country_ip_map`) or an
                                explicit block in `address/prefix-length` form
        @returns                The address as a string, or None when a
                                two-character code has no known block
        """
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        host_mask = 0xffffffff >> int(preflen)
        base = struct.unpack('!L', socket.inet_aton(addr))[0]
        # Clear any host bits in the supplied address so the whole block is
        # sampled, not just the part above the given address
        addr_min = base & (0xffffffff ^ host_mask)
        addr_max = addr_min | host_mask
        return socket.inet_ntoa(
            struct.pack('!L', random.randint(addr_min, addr_max)))
4658
4659
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """Proxy handler whose proxy can be overridden per request.

    A request may name its own proxy in the `Ytdl-request-proxy` header. SOCKS
    proxies are not handled here; they are recorded in the `Ytdl-socks-proxy`
    header instead.
    """

    def __init__(self, proxies=None):
        # Install pass-through openers for both schemes before running the
        # base initializer, so every request goes through proxy_open()
        for scheme in ('http', 'https'):
            default_opener = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
            setattr(self, f'{scheme}_open', default_opener)
        super().__init__(proxies)

    def proxy_open(self, req, proxy, type):
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            # The header is only a carrier for the override; drop it
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = urllib.parse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # yt-dlp's http/https handlers wrap the socket with socks themselves
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return super().proxy_open(req, proxy, type)
4683
4684
4685 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4686 # released into Public Domain
4687 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4688
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    Zero (and any non-positive value) is encoded as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # Minimal big-endian representation, i.e. without leading zero bytes
        raw = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        raw = b'\000'

    remainder = len(raw) % blocksize if blocksize > 0 else 0
    if remainder:
        raw = b'\000' * (blocksize - remainder) + raw
    return raw
4717
4718
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes(). An empty byte
    string yields 0.
    """
    return int.from_bytes(s, 'big')
4734
4735
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    The input bytes are reversed before being read as one big hexadecimal
    number, which is then raised to `exponent` modulo `modulus`.

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return format(ciphertext, 'x')
4751
4752
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result is laid out as `[0, 2] + filler + [0] + data`, where the
    filler consists of random non-zero byte values.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data is longer than length - 11
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The filler must never contain a zero: the first zero after the [0, 2]
    # header is what marks the start of the payload when the block is unpadded
    filler = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + filler + [0] + data
4766
4767
def _base_n_table(n, table):
    """Return the digit table to use for base `n`.

    When no table is given, the default 62-symbol table (digits, lowercase,
    uppercase) is used. The table is cut down to its first `n` symbols.

    Raises ValueError if neither argument is given, or if `n` is given and the
    table does not hold exactly `n` symbols after cutting.
    """
    if not (table or n):
        raise ValueError('Either table or n must be specified')

    if not table:
        table = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    table = table[:n]

    if n and n != len(table):
        raise ValueError(f'base {n} exceeds table length {len(table)}')
    return table
4776
4777
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string

    @param num      Non-negative integer to encode
    @param n        Base; the digit table is cut down to this many symbols
    @param table    Digit symbols to use instead of the default table
    @raises ValueError  if num is negative, or if the digit table holds fewer
                        than two symbols (for non-zero num)
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    # Floor division never brings a negative number to 0, and a one-symbol
    # table never shrinks the number; both would loop forever below
    if num < 0:
        raise ValueError('Cannot encode a negative number')
    base = len(table)
    if base < 2:
        raise ValueError('At least two symbols are needed to encode a non-zero number')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4789
4790
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int

    Each character is looked up in the digit table; a character that is not
    in the table raises KeyError.
    """
    digit_values = {char: index for index, char in enumerate(_base_n_table(n, table))}
    base = len(digit_values)
    return functools.reduce(
        lambda total, char: total * base + digit_values[char], string, 0)
4798
4799
def decode_base(value, digits):
    """Deprecated: decode `value` using `digits` as the digit table.

    Writes a deprecation notice and forwards to decode_base_n().
    """
    write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
    return decode_base_n(value, table=digits)
4804
4805
def decode_packed_codes(code):
    """Expand the packed code matched by PACKED_CODES_RE.

    The match provides the obfuscated text, a base, a symbol count and a
    `|`-separated symbol list. Every word in the obfuscated text is replaced
    by the symbol at the index the word encodes in that base, or kept as-is
    when that symbol is empty.
    """
    match = re.search(PACKED_CODES_RE, code)
    packed_text, base, count, symbols = match.groups()
    base, count = int(base), int(count)
    words = symbols.split('|')

    symbol_table = {}
    for index in range(count - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = words[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda word: symbol_table[word.group(0)],
        packed_text)
4822
4823
def caesar(s, alphabet, shift):
    """Shift every character of `s` found in `alphabet` by `shift` positions.

    The shift wraps around the alphabet; characters outside the alphabet are
    kept unchanged. A shift of 0 returns `s` itself.
    """
    if shift == 0:
        return s

    size = len(alphabet)
    shifted = []
    for char in s:
        if char in alphabet:
            char = alphabet[(alphabet.index(char) + shift) % size]
        shifted.append(char)
    return ''.join(shifted)
4831
4832
def rot47(s):
    """Apply a 47-position caesar shift over the 94-character alphabet below."""
    alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, alphabet, 47)
4835
4836
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated `KEY=value` attribute list into a dict.

    Values may be bare or double-quoted; surrounding quotes are removed.
    When a key occurs more than once, the last occurrence wins.
    """
    attributes = {}
    for match in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        key, raw = match.group('key', 'val')
        attributes[key] = raw[1:-1] if raw.startswith('"') else raw
    return attributes
4844
4845
def urshift(val, n):
    """Shift `val` right by `n` bits, treating negative values as unsigned 32-bit."""
    if val < 0:
        val += 0x100000000
    return val >> n
4848
4849
4850 # Based on png2str() written by @gdkchan and improved by @yokrysty
4851 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG bytes into `(width, height, pixels)`.

    `pixels` is a list with one list per row, each holding `width * 3`
    unfiltered byte values. The row layout is fixed at three bytes per pixel;
    other layouts are not distinguished here.

    Raises OSError when the signature or leading IHDR chunk is missing, or
    when no IDAT data is present.
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature, body = png_data[:8], png_data[8:]

    if signature != b'\x89PNG\x0d\x0a\x1a\x0a' or body[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    formats = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        return struct.unpack(formats[len(raw)], raw)[0]

    # Each chunk is laid out as: length (4) | type (4) | data (length) | CRC (4)
    chunks = []
    position, end = 0, len(body)
    while position < end:
        size = read_uint(body[position:position + 4])
        chunk_type = body[position + 4:position + 8]
        payload = body[position + 8:position + 8 + size]
        chunks.append((chunk_type, payload))
        position += 12 + size  # the CRC is skipped, not verified

    ihdr = chunks[0][1]
    width = read_uint(ihdr[:4])
    height = read_uint(ihdr[4:8])

    idat = b''.join(payload for chunk_type, payload in chunks if chunk_type == b'IDAT')
    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every scanline is preceded by one byte naming its filter type
        row_start = y * (1 + stride)
        filter_type = decompressed_data[row_start]

        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[row_start + 1 + x]
            # Neighbours are the same channel of the pixel to the left / above
            left = current_row[x - 3] if x > 2 else 0
            up = pixels[y - 1][x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = pixels[y - 1][x - 3] if x > 2 and y > 0 else 0
                estimate = left + up - upper_left

                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    predictor = left
                elif dist_up <= dist_upper_left:
                    predictor = up
                else:
                    predictor = upper_left
                color = (color + predictor) & 0xff

            current_row.append(color)

    return width, height, pixels
4955
4956
def write_xattr(path, key, value):
    """Store the bytes `value` under the extended attribute `key` of `path`.

    Raises XAttrMetadataError when writing fails and XAttrUnavailableError
    when no way of writing extended attributes is available.
    """
    if compat_os_name == 'nt':
        # Windows: Write xattrs to NTFS Alternate Data Streams:
        # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
        assert ':' not in key
        assert os.path.exists(path)

        stream_path = f'{path}:{key}'
        try:
            with open(stream_path, 'wb') as stream:
                stream.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        setxattr = xattr.set if version_tuple(xattr.__version__) >= (0, 5, 0) else None
    elif xattr:
        setxattr = xattr.setxattr
    else:
        setxattr = None

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    if check_executable('setfattr', ['--version']):
        exe = 'setfattr'
    elif check_executable('xattr', ['-h']):
        exe = 'xattr'
    else:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The executables take the value as a command-line argument, i.e. as text
    value = value.decode()
    if exe == 'xattr':
        command = [exe, '-w', key, value, path]
    else:
        command = [exe, '-n', key, '-v', value, path]

    try:
        _, stderr, returncode = Popen.run(
            command, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
5006
5007
def random_birthday(year_field, month_field, day_field):
    """Pick a random date between 1950-01-01 and 1995-12-31 (inclusive).

    Returns a dict mapping the three given field names to the year, month and
    day of that date as strings (without zero padding).
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(days=random.randint(0, span_days))
    parts = (birthday.year, birthday.month, birthday.day)
    return dict(zip((year_field, month_field, day_field), map(str, parts)))
5018
5019
# Templates for internet shortcut files, which are plain text files.
# They are %-style format strings with named placeholders: every template
# takes %(url)s, and the desktop-entry template also takes %(filename)s.

# INI-style shortcut with an [InternetShortcut] section
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# XML property list (Apple PLIST 1.0 DTD) holding the URL
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# [Desktop Entry] file of type Link
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps a link type name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5051
5052
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    An explicit port 80 is dropped only for the `http` scheme; for any other scheme it is kept, since it is not that scheme's default. IRIs without a hostname (e.g. relative references) are supported.

    Raises ValueError for IRIs containing an IPv6 host.
    """

    iri_parts = urllib.parse.urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    if iri_parts.hostname:
        # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
        net_location += iri_parts.hostname.encode('idna').decode()

    port = iri_parts.port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5095
5096
def to_high_limit_path(path):
    """Return `path` in a form that works around MAX_PATH on Windows.

    On win32/cygwin the absolute path is prefixed with `\\\\?\\`; elsewhere the
    path is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
5103
5104
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Format the value found at `field` in `obj` using `template`.

    `default` is returned instead when the value is to be ignored: with the
    default `ignore`, that is any falsy value other than 0; otherwise any
    value contained in `ignore`. `func` is applied to the value before it is
    inserted into the template.
    """
    val = traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val and val != 0
    else:
        skip = val in variadic(ignore)
    if skip:
        return default
    return template % func(val)
5110
5111
def clean_podcast_url(url):
    """Remove the analytics/tracking prefixes listed below from a podcast URL."""
    tracking_prefix = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix, '', url)
5127
5128
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version-4 UUID string.

    In the template below, every `x` becomes a random hex digit, while `y`
    carries the variant and is therefore restricted to 8, 9, a or b.
    """
    def _random_digit(placeholder):
        low, high = (8, 11) if placeholder.group() == 'y' else (0, 15)
        return _HEX_TABLE[random.randint(low, high)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5134
5135
def make_dir(path, to_screen=None):
    """Create the directory that should contain `path`, if it is missing.

    @param path       File path whose directory component should exist
    @param to_screen  Optional callable that receives a message when the
                      directory cannot be created
    @returns          True if the directory exists afterwards (or `path` has
                      no directory component), False if creating it failed
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            # exist_ok: do not fail if the directory gets created in between
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # callable() returns a bool, so it has to be tested directly;
        # comparing it with None is always true and made the default
        # to_screen=None crash here
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
5146
5147
def get_executable_path():
    """Return the absolute directory of the path reported by the updater.

    The path is the second element returned by
    `update._get_variant_and_executable_path()`.
    """
    # Imported here rather than at module level; presumably to avoid a circular import
    from .update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
5152
5153
def load_plugins(name, suffix, namespace):
    """Load plugin classes from `ytdlp_plugins/<name>/__init__.py` next to the executable.

    @param name       Name of the plugin package; also used as the module name
    @param suffix     Only attributes whose name ends with this are collected
    @param namespace  Mapping that receives the collected attributes; names
                      already present in it are skipped
    @returns          Dict of the attributes that were added to `namespace`
                      (empty if the plugin file does not exist)
    """
    loaded = {}
    with contextlib.suppress(FileNotFoundError):
        spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        module = importlib.util.module_from_spec(spec)
        # Register before executing so the module is importable while it runs
        sys.modules[spec.name] = module
        spec.loader.exec_module(module)

        for attr in dir(module):
            if attr in namespace or not attr.endswith(suffix):
                continue
            loaded[attr] = namespace[attr] = getattr(module, attr)
    return loaded
5170
5171
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                              - None:     Do nothing
                              - string:   A dictionary key
                              - int:      An index into a list
                              - tuple:    A list of keys all of which will be traversed
                              - Ellipsis: Fetch all values in the object
                              - Function: Takes the key and value as arguments
                                          and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    @returns                The result of the first path that yields a non-None value
                            (a list if the path branches, or its first element when
                            get_all is False); `default` if no path does
    # TODO: Write tests
    '''
    if not casesense:
        # Lower-case the string keys of every path so that they can be
        # compared against lower-cased dictionary keys further below
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` records the deepest level of branching (tuple/list, `...`
        # or function keys) reached for the path currently being checked.
        # It tells the caller how many levels of nested lists to flatten
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            # Stop as soon as there is nothing left to apply or to traverse
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Traverse every alternative from the current object and then
                # continue with the rest of the path for each of the results
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                # The function is called with (index, value) for sequences
                # and (key, value) for dictionaries
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    # Strings must be enumerated like sequences. Iterating the
                    # bare string yields single characters, which cannot be
                    # unpacked into the (key, value) pairs expected below
                    obj = enumerate(str(obj))
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # With casesense=False an exact match is preferred; otherwise
                # the first key that matches after lower-casing is used
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # User-supplied keys are strings: "N" is an index, "A:B:C" a slice
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # A bare ":" selects everything, i.e. behaves like `...`
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    # A type is checked with isinstance(); any other callable is used as a
    # converter whose None result means "reject"
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    else:
        type_test = expected_type or IDENTITY

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Every level of branching added one level of list nesting;
                # flatten all but the outermost, dropping None results
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
5268
5269
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj.

    Writes a deprecation notice and forwards to traverse_obj with
    is_user_input=True and traverse_string=True.
    """
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated and may be removed in a future version. '
        'Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
5274
5275
def get_first(obj, keys, **kwargs):
    """Return the first match of `keys` across all members of `obj`.

    The path is prefixed with `...` and traverse_obj is called with
    get_all=False; remaining keyword arguments are passed through.
    """
    path = (..., *variadic(keys))
    return traverse_obj(obj, path, **kwargs, get_all=False)
5278
5279
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return `x` unchanged if it is a "real" iterable, else wrap it in a 1-tuple.

    Instances of `allowed_types` are treated as single values even though
    they are iterable.
    """
    is_iterable = isinstance(x, collections.abc.Iterable)
    if is_iterable and not isinstance(x, allowed_types):
        return x
    return (x,)
5282
5283
def time_seconds(**kwargs):
    """Return the current time in seconds since the epoch, shifted by an offset.

    @param kwargs  Keyword arguments for datetime.timedelta (e.g. hours=9)
                   describing the offset to add
    @returns       time.time() plus the offset in seconds, as a float
    """
    # The timestamp of a timezone-aware datetime does not depend on its
    # timezone, so the offset has to be added to the epoch time explicitly
    return time.time() + datetime.timedelta(**kwargs).total_seconds()
5287
5288
# Create a JSON Web Signature (JWS) using the HS256 algorithm.
# The result follows the layout of the JWS Compact Serialization
# JWT: https://www.rfc-editor.org/rfc/rfc7519.html
# JWS: https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Build a signed `header.payload.signature` token.

    @param payload_data  JSON-serializable payload
    @param key           Secret (str) used as the HMAC-SHA256 key
    @param headers       Extra header fields merged over the default
                         `alg`/`typ` header; the mapping is not modified
    @returns             The token as bytes

    NOTE: every part is encoded with standard Base64 (base64.b64encode),
    i.e. padding is kept and the URL-safe alphabet is not used.
    """
    header_data = {'alg': 'HS256', 'typ': 'JWT'}
    if headers:
        header_data.update(headers)

    def _encode_part(data):
        return base64.b64encode(json.dumps(data).encode())

    signing_input = b'.'.join((_encode_part(header_data), _encode_part(payload_data)))
    digest = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(digest)
5306
5307
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode the payload of a compact `header.payload.signature` token.

    The signature is NOT verified and the header is ignored.

    @param jwt  The token as str or bytes; must consist of exactly three
                dot-separated parts
    @returns    The JSON-decoded payload
    @raises ValueError  if the token does not have exactly three parts
    """
    if isinstance(jwt, bytes):
        # jwt_encode_hs256 returns bytes, so accept those as well
        jwt = jwt.decode()
    _, payload_b64, _ = jwt.split('.')
    # Tokens normally have the trailing "=" padding stripped (RFC 7515),
    # but the decoder requires it; restore whatever is missing
    payload_b64 += '=' * (-len(payload_b64) % 4)
    return json.loads(base64.urlsafe_b64decode(payload_b64))
5313
5314
# None on non-Windows systems; False on Windows until windows_enable_vt_mode() succeeds
WINDOWS_VT_MODE = None if compat_os_name != 'nt' else False


@functools.cache
def supports_terminal_sequences(stream):
    """Tell whether terminal escape sequences can be written to `stream`.

    Requires WINDOWS_VT_MODE to be truthy on Windows, or a non-empty TERM
    environment variable elsewhere, and the stream to be a TTY. Any error
    raised while querying the stream counts as "not supported".
    The result is cached per stream.
    """
    if compat_os_name == 'nt':
        terminal_capable = WINDOWS_VT_MODE
    else:
        terminal_capable = os.getenv('TERM')
    if not terminal_capable:
        return False

    try:
        return stream.isatty()
    except BaseException:
        return False
5329
5330
def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    """Try to enable terminal sequence support on Windows.

    Does nothing on Windows versions older than 10.0.10586 or if spawning
    the empty shell command fails. On success, WINDOWS_VT_MODE is set and
    the cache of supports_terminal_sequences is cleared.
    """
    global WINDOWS_VT_MODE

    if get_windows_version() < (10, 0, 10586):
        return

    try:
        Popen.run('', shell=True)
    except Exception:
        return
    else:
        WINDOWS_VT_MODE = True
        # Cached answers were computed while VT mode was still disabled
        supports_terminal_sequences.cache_clear()
5342
5343
# Matches ESC "[" followed by one or more non-"m" characters and a closing "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with all terminal formatting sequences stripped."""
    return re.sub(_terminal_sequences_re, '', string)
5349
5350
def number_of_digits(number):
    """Return the length of `number` when formatted as an integer with `%d`.

    Note that a minus sign is counted as well.
    """
    as_integer_text = '%d' % number
    return len(as_integer_text)
5353
5354
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy values with `delim`.

    @param values     Values to join; when `from_dict` is given they are
                      instead keys/paths looked up in it with traverse_obj
    @param delim      Separator placed between the values
    @param from_dict  Optional object to fetch the actual values from
    """
    if from_dict is not None:
        values = (traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
5359
5360
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Replace whatever `url_width_re` matches with that width
    * Update dimensions (values already present in the thumbnail are merged in last)

    The thumbnails are returned unchanged if no format has a known width.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    # Tuples compare element-wise, so the width decides and the height breaks ties
    largest = max(
        (tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats),
        default=(0, 0))
    max_width = largest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumb in thumbnails:
        scaled_url = re.sub(url_width_re, str(max_width), thumb['url'])
        scaled.append(merge_dicts(
            {'url': scaled_url}, dict(zip(dimension_keys, largest)), thumb))
    return scaled
5381
5382
def parse_http_range(range):
    """Parse value of "Range" or "Content-Range" HTTP header into tuple.

    @returns  (start, end, total); `end` and `total` are None when absent,
              and all three are None if the value is empty or unrecognised
    """
    unknown = (None, None, None)
    if not range:
        return unknown

    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if mobj is None:
        return unknown

    start, end, total = mobj.group(1), mobj.group(2), mobj.group(3)
    return int(start), int_or_none(end), int_or_none(total)
5391
5392
def read_stdin(what):
    """Announce that `what` is being read from STDIN and return sys.stdin.

    The notice names the platform's EOF key combination.
    """
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    else:
        eof = 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5397
5398
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data  The leading bytes of the file
    @returns (encoding, bytes to skip); encoding is None if nothing was detected
    """

    # BOM marks are given priority over declarations
    bom_match = next(
        ((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_match is not None:
        return bom_match

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    # re.match anchors at the start of the data, so the "# coding: ..."
    # declaration is only recognised on the first line
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if not declaration:
        return None, 0
    return declaration.group(1).decode(), 0
5415
5416
class Config:
    """A list of command-line arguments plus the config files they pull in.

    Each instance holds its own arguments (`own_args`) and a list of nested
    Config objects (`configs`), one per location named by the parsed
    `config_locations` option.
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        # _loaded_paths is shared with nested configs (see append_config) so
        # that the same file is not loaded twice
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Set the arguments (and the file they came from) and load nested configs.

        @returns  The result of load_configs()
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own_args and load every config location they refer to.

        @returns  False if this config's file has been loaded before, else True
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            # Relative locations inside a config file are resolved against its directory
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        # Own arguments first (with credentials hidden), then every nested
        # config with its lines prefixed by "| "
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read a config file and split its contents into arguments.

        @param filename  Path of the config file
        @param default   Returned as-is if the file cannot be opened.
                         NOTE: the built-in default is a shared list; callers
                         must not modify it
        @returns         List of arguments (shell-like splitting, "#" comments removed)
        @raises ValueError  if the file cannot be decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present

        # The context manager closes the file on every path, including
        # unexpected errors during encoding detection
        with optionf:
            try:
                enc, skip = determine_file_encoding(optionf.read(512))
                optionf.seek(skip, io.SEEK_SET)
            except OSError:
                enc = None  # silently skip read errors
            try:
                # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                contents = optionf.read().decode(enc or preferredencoding())
                return shlex.split(contents, comments=True)
            except Exception as err:
                raise ValueError(f'Unable to parse "{filename}": {err}') from err

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with the values of credential options replaced by "PRIVATE"."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handles the "--option=value" form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handles the "--option value" form
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a nested config from `args` (as accepted by init) and keep it if it loaded."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Yield the arguments of all nested configs (last loaded first), then the own ones."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
5521
5522
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes

    Every instance owns a private event loop on which the connection's
    coroutines are run to completion one call at a time.
    """
    pool = None

    def __init__(self, url, headers=None, connect=True):
        """
        @param url      URL passed to websockets.connect
        @param headers  Extra headers for the handshake
        @param connect  Whether to open the connection immediately
        """
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Make sure the connection gets closed when the interpreter exits
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Remaining tasks have to be cancelled while the loop can still
            # run; a closed loop refuses run_until_complete()
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` on `loop` until it completes and return its result.

        @raises ValueError  if `main` is not a coroutine
        """
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel every task of `loop`, wait for them and report unexpected failures."""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # gather() takes the loop from the tasks themselves; its "loop"
        # parameter was removed in python 3.10
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5592
5593
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones

    Header names are normalized with str.title() in the result.
    """
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
5597
5598
def cached_method(f):
    """Decorator that caches a method's results on the instance.

    Results are stored in the instance attribute `__cached_method__cache`,
    keyed by method name and by the bound arguments (defaults applied,
    including the instance itself), so all arguments must be hashable.
    """
    sig = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Normalize the call so that positional, keyword and defaulted
        # arguments produce the same key
        call = sig.bind(self, *args, **kwargs)
        call.apply_defaults()
        cache_key = tuple(call.arguments.values())

        if not hasattr(self, '__cached_method__cache'):
            self.__cached_method__cache = {}
        method_cache = self.__cached_method__cache.setdefault(f.__name__, {})

        if cache_key in method_cache:
            return method_cache[cache_key]
        result = method_cache[cache_key] = f(self, *args, **kwargs)
        return result

    return wrapper
5616
5617
class classproperty:
    """Descriptor that evaluates a function with the owning class on attribute access"""

    def __init__(self, func):
        # Copy name, docstring etc. of the wrapped function onto the descriptor
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, instance, owner):
        # Works for access through the class as well as through an instance
        getter = self.func
        return getter(owner)
5627
5628
class Namespace(types.SimpleNamespace):
    """Namespace meant to be used as immutable (this is not enforced here)

    Iterating over it yields the attribute values.
    """

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        """The (name, value) pairs of all attributes"""
        return vars(self).items()
5638
5639
# Deprecated: boolean flags telling whether the optional `certifi` and
# `websockets` objects imported from .dependencies are truthy
has_certifi = bool(certifi)
has_websockets = bool(websockets)